Skip to content

Commit 4f2d6a1

Browse files
committed
[ET Device Support] Add device tensor helper functions to TensorPtr API
Add clone_tensor_ptr_to_device and clone_tensor_ptr_to_cpu to tensor_ptr.h for cloning tensors between host and device memory via the DeviceAllocatorRegistry. Extend the existing make_tensor_ptr(const TensorPtr&, ...) overload with optional device_type/device_index parameters (defaulting to CPU/0) for seamless device placement.

Differential Revision: [D99913077](https://our.internmc.facebook.com/intern/diff/D99913077/)
ghstack-source-id: 364093618
Pull Request resolved: #18761
1 parent 8e700ab commit 4f2d6a1

5 files changed

Lines changed: 529 additions & 3 deletions

File tree

extension/tensor/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def define_common_targets():
2424
],
2525
visibility = ["PUBLIC"],
2626
deps = [
27+
"//executorch/runtime/core:device_allocator",
2728
"//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
2829
"//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
2930
],

extension/tensor/tensor_ptr.cpp

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <numeric>
1212

13+
#include <executorch/runtime/core/device_allocator.h>
1314
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
1415

1516
namespace executorch {
@@ -248,5 +249,179 @@ runtime::Error resize_tensor_ptr(
248249
sizes.data(), sizes.size()));
249250
}
250251

252+
// ---- Device tensor helpers ----

namespace {

#ifndef USE_ATEN_LIB
// Keeps a device-backed TensorImpl alive together with the metadata vectors
// it points into, plus the deleter that releases the device buffer.
//
// Lifetime notes:
// - `tensor_impl` stores raw pointers into `sizes`, `dim_order`, and
//   `strides`; those pointers stay valid across the constructor's moves
//   because moving a std::vector transfers its heap buffer rather than
//   reallocating it.
// - `tensor` points at `this->tensor_impl`, so this struct must never be
//   copied or moved after construction; it is only ever created inside a
//   shared_ptr (see make_tensor_ptr_with_device), which guarantees a stable
//   address.
struct DeviceStorage final {
  executorch::aten::TensorImpl tensor_impl;
  executorch::aten::Tensor tensor;
  std::vector<executorch::aten::SizesType> sizes;
  std::vector<executorch::aten::DimOrderType> dim_order;
  std::vector<executorch::aten::StridesType> strides;
  // Called with the data pointer on destruction; typically returns the
  // device buffer to its DeviceAllocator.
  std::function<void(void*)> deleter;

  DeviceStorage(
      executorch::aten::TensorImpl&& tensor_impl,
      std::vector<executorch::aten::SizesType>&& sizes,
      std::vector<executorch::aten::DimOrderType>&& dim_order,
      std::vector<executorch::aten::StridesType>&& strides,
      std::function<void(void*)>&& deleter)
      : tensor_impl(std::move(tensor_impl)),
        tensor(&this->tensor_impl),
        sizes(std::move(sizes)),
        dim_order(std::move(dim_order)),
        strides(std::move(strides)),
        deleter(std::move(deleter)) {}

  ~DeviceStorage() {
    // A null deleter means the tensor does not own its data.
    if (deleter) {
      deleter(tensor_impl.mutable_data());
    }
  }
};
#endif // USE_ATEN_LIB

// Wraps an existing device data pointer in a TensorPtr with contiguous
// dim order and strides derived from `sizes`. The returned tensor takes
// ownership of `data` via `deleter`.
//
// NOTE(review): in the ATen build the device_type/device_index are dropped
// and a plain (CPU-typed) tensor wraps the pointer — presumably relying on
// the caller to only dereference it through the allocator; confirm this is
// the intended ATen-mode behavior.
TensorPtr make_tensor_ptr_with_device(
    std::vector<executorch::aten::SizesType> sizes,
    void* data,
    executorch::aten::ScalarType type,
    runtime::etensor::DeviceType device_type,
    runtime::etensor::DeviceIndex device_index,
    std::function<void(void*)> deleter) {
  const auto dim = sizes.size();
  // Contiguous (identity) dim order: 0, 1, ..., dim-1.
  std::vector<executorch::aten::DimOrderType> dim_order(dim);
  std::iota(dim_order.begin(), dim_order.end(), 0);

  std::vector<executorch::aten::StridesType> strides(dim);
  if (dim > 0) {
    auto error = runtime::dim_order_to_stride(
        sizes.data(), dim_order.data(), dim, strides.data());
    ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
  }

#ifndef USE_ATEN_LIB
  // TensorImpl keeps raw pointers into the local vectors; the vectors are
  // moved into DeviceStorage below, which preserves their buffers.
  executorch::aten::TensorImpl tensor_impl(
      type,
      dim,
      sizes.data(),
      data,
      dim_order.data(),
      strides.data(),
      dim > 0 ? executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND
              : executorch::aten::TensorShapeDynamism::STATIC,
      device_type,
      device_index);
  auto storage = std::make_shared<DeviceStorage>(
      std::move(tensor_impl),
      std::move(sizes),
      std::move(dim_order),
      std::move(strides),
      std::move(deleter));
  const auto tensor_ptr = &storage->tensor;
  // Aliasing shared_ptr: shares ownership of the whole DeviceStorage but
  // points at the Tensor inside it, so metadata and deleter live exactly as
  // long as the returned TensorPtr.
  return std::shared_ptr<executorch::aten::Tensor>(
      std::move(storage), tensor_ptr);
#else
  (void)device_type;
  (void)device_index;
  return make_tensor_ptr(
      std::move(sizes),
      data,
      type,
      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
      std::move(deleter));
#endif // USE_ATEN_LIB
}

} // namespace
338+
339+
// Copies a CPU tensor's bytes into freshly allocated device memory and wraps
// the device buffer in a TensorPtr that frees it through the same allocator.
// Aborts (ET_CHECK) on CPU targets, missing allocator, null source data, or
// allocation/copy failure. See tensor_ptr.h for the full contract.
TensorPtr clone_tensor_ptr_to_device(
    const TensorPtr& cpu_tensor,
    runtime::etensor::DeviceType device_type,
    runtime::etensor::DeviceIndex device_index) {
  ET_CHECK_MSG(
      device_type != runtime::etensor::DeviceType::CPU,
      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");

  auto* const device_allocator = runtime::get_device_allocator(device_type);
  ET_CHECK_MSG(
      device_allocator != nullptr,
      "No device allocator registered for device type %d",
      static_cast<int>(device_type));

  const auto byte_count = cpu_tensor->nbytes();
  const auto* host_data = cpu_tensor->const_data_ptr();
  ET_CHECK_MSG(host_data != nullptr, "Source tensor has no data.");

  auto allocation = device_allocator->allocate(byte_count, device_index);
  ET_CHECK_MSG(allocation.ok(), "Failed to allocate device memory.");
  void* const device_data = allocation.get();

  const auto copy_status = device_allocator->copy_host_to_device(
      device_data, host_data, byte_count, device_index);
  ET_CHECK_MSG(
      copy_status == runtime::Error::Ok, "Host-to-device copy failed.");

  const auto source_sizes = cpu_tensor->sizes();
  std::vector<executorch::aten::SizesType> sizes(
      source_sizes.begin(), source_sizes.end());

  // The deleter hands the buffer back to its allocator. Capturing the raw
  // allocator pointer assumes the registry keeps it alive for the tensor's
  // lifetime — TODO confirm against DeviceAllocatorRegistry ownership rules.
  return make_tensor_ptr_with_device(
      std::move(sizes),
      device_data,
      cpu_tensor->scalar_type(),
      device_type,
      device_index,
      [device_allocator, device_index](void* ptr) {
        device_allocator->deallocate(ptr, device_index);
      });
}
378+
379+
// Copies a device tensor's bytes into host memory and returns a CPU-backed
// TensorPtr owning that copy. The source device is read from the tensor's
// own metadata. Aborts (ET_CHECK) if the source is already on CPU, is on an
// unsupported device, has no data, has no registered allocator, or the copy
// fails. See tensor_ptr.h for the full contract.
TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
  const auto nbytes = device_tensor->nbytes();
  const auto* device_data = device_tensor->const_data_ptr();
  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");

#ifndef USE_ATEN_LIB
  const auto device_type = device_tensor->unsafeGetTensorImpl()->device_type();
  const auto device_index =
      device_tensor->unsafeGetTensorImpl()->device_index();
#else
  const auto& aten_device = device_tensor->device();
  ET_CHECK_MSG(!aten_device.is_cpu(), "Source tensor is already on CPU.");
  // Fail explicitly on devices we cannot map. Previously a non-CPU,
  // non-CUDA device (e.g. MPS) silently fell through as CPU and aborted
  // with the misleading "already on CPU" message below.
  ET_CHECK_MSG(
      aten_device.is_cuda(),
      "Unsupported source device type %d",
      static_cast<int>(aten_device.type()));
  const auto device_type = runtime::etensor::DeviceType::CUDA;
  const auto device_index =
      static_cast<runtime::etensor::DeviceIndex>(aten_device.index());
#endif

  ET_CHECK_MSG(
      device_type != runtime::etensor::DeviceType::CPU,
      "Source tensor is already on CPU.");

  auto* allocator = runtime::get_device_allocator(device_type);
  ET_CHECK_MSG(
      allocator != nullptr,
      "No device allocator registered for device type %d",
      static_cast<int>(device_type));

  // Host-side staging buffer; ownership moves into the returned tensor.
  std::vector<uint8_t> cpu_data(nbytes);

  auto err = allocator->copy_device_to_host(
      cpu_data.data(), device_data, nbytes, device_index);
  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");

  std::vector<executorch::aten::SizesType> sizes(
      device_tensor->sizes().begin(), device_tensor->sizes().end());

  // Empty dim_order/strides request the default contiguous layout; the
  // scalar type reinterprets the raw byte vector as the source dtype.
  return make_tensor_ptr(
      std::move(sizes),
      std::move(cpu_data),
      {},
      {},
      device_tensor->scalar_type());
}
425+
251426
} // namespace extension
252427
} // namespace executorch

extension/tensor/tensor_ptr.h

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <executorch/runtime/core/error.h>
1818
#include <executorch/runtime/core/exec_aten/exec_aten.h>
1919
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
20+
#include <executorch/runtime/core/portable_type/device.h>
2021

2122
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")
2223

@@ -388,27 +389,59 @@ inline TensorPtr make_tensor_ptr(
388389
std::move(deleter));
389390
}
390391

392+
/**
 * Clones a CPU TensorPtr to a device TensorPtr.
 *
 * Allocates memory on the specified device and copies the tensor data from
 * host to device using the DeviceAllocator registered for the given device
 * type. The returned TensorPtr owns the device memory and will free it via
 * the allocator when destroyed.
 *
 * Declared ahead of the make_tensor_ptr overload below, which calls it when
 * a non-CPU device is requested.
 *
 * @param cpu_tensor The source CPU tensor whose data will be copied.
 * @param device_type The target device type (e.g., DeviceType::CUDA).
 * @param device_index The target device index (default 0).
 * @return A TensorPtr backed by device memory containing the copied data.
 */
TensorPtr clone_tensor_ptr_to_device(
    const TensorPtr& cpu_tensor,
    runtime::etensor::DeviceType device_type,
    runtime::etensor::DeviceIndex device_index = 0);
411+
391412
/**
392413
* Convenience overload identical to make_tensor_ptr(*tensor_ptr, ...).
393414
* Keeps the original TensorPtr alive until the returned TensorPtr is destroyed.
415+
* When device_type is not CPU, the tensor data is additionally copied to the
416+
* specified device.
394417
*
395418
* @param tensor_ptr The source tensor pointer to alias.
396419
* @param sizes Optional sizes override.
397420
* @param dim_order Optional dimension order override.
398421
* @param strides Optional strides override.
399-
* @return A TensorPtr aliasing the same storage with requested metadata.
422+
* @param device_type The target device type (default CPU, meaning no copy).
423+
* @param device_index The target device index (default 0).
424+
* @return A TensorPtr aliasing the same storage with requested metadata, or a
425+
* device TensorPtr if device_type is not CPU.
400426
*/
401427
inline TensorPtr make_tensor_ptr(
402428
const TensorPtr& tensor_ptr,
403429
std::vector<executorch::aten::SizesType> sizes = {},
404430
std::vector<executorch::aten::DimOrderType> dim_order = {},
405-
std::vector<executorch::aten::StridesType> strides = {}) {
406-
return make_tensor_ptr(
431+
std::vector<executorch::aten::StridesType> strides = {},
432+
runtime::etensor::DeviceType device_type =
433+
runtime::etensor::DeviceType::CPU,
434+
runtime::etensor::DeviceIndex device_index = 0) {
435+
auto result = make_tensor_ptr(
407436
*tensor_ptr,
408437
std::move(sizes),
409438
std::move(dim_order),
410439
std::move(strides),
411440
[tensor_ptr](void*) {});
441+
if (device_type != runtime::etensor::DeviceType::CPU) {
442+
return clone_tensor_ptr_to_device(result, device_type, device_index);
443+
}
444+
return result;
412445
}
413446

414447
/**
@@ -479,6 +512,18 @@ runtime::Error resize_tensor_ptr(
479512
TensorPtr& tensor,
480513
const std::vector<executorch::aten::SizesType>& sizes);
481514

515+
/**
 * Clones a device TensorPtr to a CPU TensorPtr.
 *
 * Allocates host memory and copies the tensor data from device to host using
 * the DeviceAllocator registered for the source tensor's device type. The
 * device type is determined from the source tensor's metadata.
 *
 * Aborts (ET_CHECK) if the source tensor is already on CPU, has no data, or
 * no allocator is registered for its device type.
 *
 * @param device_tensor The source device tensor whose data will be copied.
 * @return A TensorPtr backed by CPU memory containing the copied data.
 */
TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor);
526+
482527
} // namespace extension
483528
} // namespace executorch
484529

extension/tensor/test/targets.bzl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,14 @@ def define_common_targets():
2121
"//executorch/extension/tensor:tensor" + aten_suffix,
2222
],
2323
)
24+
25+
runtime.cxx_test(
26+
name = "tensor_ptr_device_test" + aten_suffix,
27+
srcs = [
28+
"tensor_ptr_device_test.cpp",
29+
],
30+
deps = [
31+
"//executorch/extension/tensor:tensor" + aten_suffix,
32+
"//executorch/runtime/core:device_allocator",
33+
],
34+
)

0 commit comments

Comments
 (0)