Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions extension/tensor/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def define_common_targets():
],
visibility = ["PUBLIC"],
deps = [
"//executorch/runtime/core:device_allocator",
"//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
"//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
],
Expand Down
201 changes: 194 additions & 7 deletions extension/tensor/tensor_ptr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <numeric>

#include <executorch/runtime/core/device_allocator.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

namespace executorch {
Expand Down Expand Up @@ -61,7 +62,9 @@ TensorPtr make_tensor_ptr(
std::vector<executorch::aten::StridesType> strides,
executorch::aten::ScalarType type,
executorch::aten::TensorShapeDynamism dynamism,
std::function<void(void*)> deleter) {
std::function<void(void*)> deleter,
runtime::etensor::DeviceType device_type,
runtime::etensor::DeviceIndex device_index) {
const auto dim = sizes.size();
ET_CHECK_MSG(
dim_order.empty() || dim_order.size() == dim,
Expand Down Expand Up @@ -101,6 +104,7 @@ TensorPtr make_tensor_ptr(

strides = std::move(computed_strides);

TensorPtr cpu_tensor;
#ifndef USE_ATEN_LIB
executorch::aten::TensorImpl tensor_impl(
type,
Expand All @@ -116,9 +120,9 @@ TensorPtr make_tensor_ptr(
std::move(dim_order),
std::move(strides),
std::move(deleter));
const auto tensor_ptr = &storage->tensor;
return std::shared_ptr<executorch::aten::Tensor>(
std::move(storage), tensor_ptr);
const auto raw_tensor_ptr = &storage->tensor;
cpu_tensor = std::shared_ptr<executorch::aten::Tensor>(
std::move(storage), raw_tensor_ptr);
#else
auto options = c10::TensorOptions()
.dtype(c10::scalarTypeToTypeMeta(type))
Expand All @@ -136,8 +140,13 @@ TensorPtr make_tensor_ptr(
c10::DispatchKeySet(c10::DispatchKey::CPU),
options.dtype());
tensor_impl->set_sizes_and_strides(sizes, strides);
return std::make_shared<executorch::aten::Tensor>(std::move(tensor_impl));
cpu_tensor =
std::make_shared<executorch::aten::Tensor>(std::move(tensor_impl));
#endif // USE_ATEN_LIB
if (device_type != runtime::etensor::DeviceType::CPU) {
return clone_tensor_ptr_to_device(cpu_tensor, device_type, device_index);
}
return cpu_tensor;
}

TensorPtr make_tensor_ptr(
Expand All @@ -146,7 +155,9 @@ TensorPtr make_tensor_ptr(
std::vector<executorch::aten::DimOrderType> dim_order,
std::vector<executorch::aten::StridesType> strides,
executorch::aten::ScalarType type,
executorch::aten::TensorShapeDynamism dynamism) {
executorch::aten::TensorShapeDynamism dynamism,
runtime::etensor::DeviceType device_type,
runtime::etensor::DeviceIndex device_index) {
ET_CHECK_MSG(
data.size() ==
executorch::aten::compute_numel(sizes.data(), sizes.size()) *
Expand All @@ -161,7 +172,9 @@ TensorPtr make_tensor_ptr(
type,
dynamism,
// Data is moved into the deleter and is destroyed together with Storage.
[data = std::move(data)](void*) {});
[data = std::move(data)](void*) {},
device_type,
device_index);
}

TensorPtr clone_tensor_ptr(
Expand Down Expand Up @@ -248,5 +261,179 @@ runtime::Error resize_tensor_ptr(
sizes.data(), sizes.size()));
}

// ---- Device tensor helpers ----

namespace {

#ifndef USE_ATEN_LIB
struct DeviceStorage final {
executorch::aten::TensorImpl tensor_impl;
executorch::aten::Tensor tensor;
std::vector<executorch::aten::SizesType> sizes;
std::vector<executorch::aten::DimOrderType> dim_order;
std::vector<executorch::aten::StridesType> strides;
std::function<void(void*)> deleter;

DeviceStorage(
executorch::aten::TensorImpl&& tensor_impl,
std::vector<executorch::aten::SizesType>&& sizes,
std::vector<executorch::aten::DimOrderType>&& dim_order,
std::vector<executorch::aten::StridesType>&& strides,
std::function<void(void*)>&& deleter)
: tensor_impl(std::move(tensor_impl)),
tensor(&this->tensor_impl),
sizes(std::move(sizes)),
dim_order(std::move(dim_order)),
strides(std::move(strides)),
deleter(std::move(deleter)) {}

~DeviceStorage() {
if (deleter) {
deleter(tensor_impl.mutable_data());
}
}
};
#endif // USE_ATEN_LIB

TensorPtr make_tensor_ptr_with_device(
std::vector<executorch::aten::SizesType> sizes,
void* data,
executorch::aten::ScalarType type,
runtime::etensor::DeviceType device_type,
runtime::etensor::DeviceIndex device_index,
std::function<void(void*)> deleter) {
const auto dim = sizes.size();
std::vector<executorch::aten::DimOrderType> dim_order(dim);
std::iota(dim_order.begin(), dim_order.end(), 0);

std::vector<executorch::aten::StridesType> strides(dim);
if (dim > 0) {
auto error = runtime::dim_order_to_stride(
sizes.data(), dim_order.data(), dim, strides.data());
ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
}

#ifndef USE_ATEN_LIB
executorch::aten::TensorImpl tensor_impl(
type,
dim,
sizes.data(),
data,
dim_order.data(),
strides.data(),
dim > 0 ? executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND
: executorch::aten::TensorShapeDynamism::STATIC,
device_type,
device_index);
auto storage = std::make_shared<DeviceStorage>(
std::move(tensor_impl),
std::move(sizes),
std::move(dim_order),
std::move(strides),
std::move(deleter));
const auto tensor_ptr = &storage->tensor;
return std::shared_ptr<executorch::aten::Tensor>(
std::move(storage), tensor_ptr);
#else
(void)device_type;
(void)device_index;
return make_tensor_ptr(
std::move(sizes),
data,
type,
executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
std::move(deleter));
#endif // USE_ATEN_LIB
}

} // namespace

TensorPtr clone_tensor_ptr_to_device(
const TensorPtr& cpu_tensor,
runtime::etensor::DeviceType device_type,
runtime::etensor::DeviceIndex device_index) {
ET_CHECK_MSG(
device_type != runtime::etensor::DeviceType::CPU,
"Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");

auto* allocator = runtime::get_device_allocator(device_type);
ET_CHECK_MSG(
allocator != nullptr,
"No device allocator registered for device type %d",
static_cast<int>(device_type));

const auto nbytes = cpu_tensor->nbytes();
const auto* cpu_data = cpu_tensor->const_data_ptr();
ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");

auto result = allocator->allocate(nbytes, device_index);
ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
void* device_data = result.get();

auto err = allocator->copy_host_to_device(
device_data, cpu_data, nbytes, device_index);
ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");

std::vector<executorch::aten::SizesType> sizes(
cpu_tensor->sizes().begin(), cpu_tensor->sizes().end());

return make_tensor_ptr_with_device(
std::move(sizes),
device_data,
cpu_tensor->scalar_type(),
device_type,
device_index,
[allocator, device_index](void* ptr) {
allocator->deallocate(ptr, device_index);
});
}

TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
const auto nbytes = device_tensor->nbytes();
const auto* device_data = device_tensor->const_data_ptr();
ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");

#ifndef USE_ATEN_LIB
const auto device_type = device_tensor->unsafeGetTensorImpl()->device_type();
const auto device_index =
device_tensor->unsafeGetTensorImpl()->device_index();
#else
const auto& aten_device = device_tensor->device();
ET_CHECK_MSG(!aten_device.is_cpu(), "Source tensor is already on CPU.");
auto device_type = runtime::etensor::DeviceType::CPU;
if (aten_device.is_cuda()) {
device_type = runtime::etensor::DeviceType::CUDA;
}
const auto device_index =
static_cast<runtime::etensor::DeviceIndex>(aten_device.index());
#endif

ET_CHECK_MSG(
device_type != runtime::etensor::DeviceType::CPU,
"Source tensor is already on CPU.");

auto* allocator = runtime::get_device_allocator(device_type);
ET_CHECK_MSG(
allocator != nullptr,
"No device allocator registered for device type %d",
static_cast<int>(device_type));

std::vector<uint8_t> cpu_data(nbytes);

auto err = allocator->copy_device_to_host(
cpu_data.data(), device_data, nbytes, device_index);
ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");

std::vector<executorch::aten::SizesType> sizes(
device_tensor->sizes().begin(), device_tensor->sizes().end());

return make_tensor_ptr(
std::move(sizes),
std::move(cpu_data),
{},
{},
device_tensor->scalar_type());
}

} // namespace extension
} // namespace executorch
Loading
Loading