CodeLinaro
diff --git a/‎extension/tensor/targets.bzl‎
Lines changed: 1 addition & 0 deletions b/‎extension/tensor/targets.bzl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎extension/tensor/tensor_ptr.cpp‎
Lines changed: 116 additions & 6 deletions b/‎extension/tensor/tensor_ptr.cpp‎
Lines changed: 116 additions & 6 deletions
@@ -24,6 +24,7 @@ def define_common_targets():
             ],
             visibility = ["PUBLIC"],
             deps = [
+                "//executorch/runtime/core:device_allocator",
                 "//executorch/runtime/core/exec_aten/util:dim_order_util" + aten_suffix,
                 "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
             ],
 
@@ -12,6 +12,9 @@
 
 #include <c10/util/safe_numerics.h>
 
+#ifndef USE_ATEN_LIB
+#include <executorch/runtime/core/device_allocator.h>
+#endif // USE_ATEN_LIB
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 
 namespace executorch {
@@ -25,6 +28,9 @@ namespace {
  * ensures that they are managed together and have the same lifetime as the
  * Tensor. When the Tensor is destroyed, the Storage structure ensures
  * proper cleanup of the associated metadata and data if needed.
+ *
+ * For device tensors, the data pointer points to device memory; the deleter
+ * is responsible for freeing it through the appropriate DeviceAllocator.
  */
 struct Storage final {
   executorch::aten::TensorImpl tensor_impl;
@@ -47,6 +53,11 @@ struct Storage final {
         strides(std::move(strides)),
         deleter(std::move(deleter)) {}
 
+  Storage(const Storage&) = delete;
+  Storage& operator=(const Storage&) = delete;
+  Storage(Storage&&) = delete;
+  Storage& operator=(Storage&&) = delete;
+
   ~Storage() {
     if (deleter) {
       deleter(tensor_impl.mutable_data());
@@ -63,7 +74,8 @@ TensorPtr make_tensor_ptr(
     std::vector<executorch::aten::StridesType> strides,
     executorch::aten::ScalarType type,
     executorch::aten::TensorShapeDynamism dynamism,
-    std::function<void(void*)> deleter) {
+    std::function<void(void*)> deleter,
+    executorch::aten::Device device) {
   const auto dim = sizes.size();
   ET_CHECK_MSG(
       dim_order.empty() || dim_order.size() == dim,
@@ -111,20 +123,22 @@ TensorPtr make_tensor_ptr(
       data,
       dim_order.data(),
       strides.data(),
-      dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC);
+      dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC,
+      device.type(),
+      device.index());
   auto storage = std::make_shared<Storage>(
       std::move(tensor_impl),
       std::move(sizes),
       std::move(dim_order),
       std::move(strides),
       std::move(deleter));
-  const auto tensor_ptr = &storage->tensor;
+  const auto raw_tensor_ptr = &storage->tensor;
   return std::shared_ptr<executorch::aten::Tensor>(
-      std::move(storage), tensor_ptr);
+      std::move(storage), raw_tensor_ptr);
 #else
   auto options = c10::TensorOptions()
                      .dtype(c10::scalarTypeToTypeMeta(type))
-                     .device(c10::kCPU);
+                     .device(device);
   auto storage = c10::Storage(
       c10::Storage::use_byte_size_t(),
       at::detail::computeStorageNbytes(
@@ -135,7 +149,7 @@ TensorPtr make_tensor_ptr(
       false);
   auto tensor_impl = c10::make_intrusive<executorch::aten::TensorImpl>(
       std::move(storage),
-      c10::DispatchKeySet(c10::DispatchKey::CPU),
+      c10::DispatchKeySet(options.computeDispatchKey()),
       options.dtype());
   tensor_impl->set_sizes_and_strides(sizes, strides);
   return std::make_shared<executorch::aten::Tensor>(std::move(tensor_impl));
@@ -271,5 +285,101 @@ runtime::Error resize_tensor_ptr(
           sizes.data(), sizes.size()));
 }
 
+// ---- Device tensor helpers ----
+//
+// These helpers rely on the ExecuTorch DeviceAllocator and the portable tensor
+// metadata APIs (dim_order, shape_dynamism, device), which have no equivalent
+// in USE_ATEN_LIB builds, so they are compiled out there.
+
+#ifndef USE_ATEN_LIB
+
+/**
+ * Produces a copy of a CPU tensor whose data lives in memory obtained from the
+ * DeviceAllocator registered for `device.type()`.
+ *
+ * Sizes, dim order, strides, scalar type and shape dynamism are taken from the
+ * source tensor. The returned tensor's deleter hands the device buffer back to
+ * the same allocator, using the same device index it was allocated with.
+ *
+ * Each precondition and each allocator call is guarded by ET_CHECK_MSG.
+ *
+ * @param cpu_tensor Source tensor; must report a CPU device and have non-null
+ * data.
+ * @param device Destination device; must not be CPU.
+ * @return A TensorPtr describing the newly allocated device copy.
+ */
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    executorch::aten::Device device) {
+  // Placement checks come first: the source has to be on CPU and the
+  // destination has to be something other than CPU.
+  ET_CHECK_MSG(
+      cpu_tensor->device().is_cpu(),
+      "Source tensor must reside on CPU; got device type %d.",
+      static_cast<int>(cpu_tensor->device_type()));
+  ET_CHECK_MSG(
+      !device.is_cpu(),
+      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");
+
+  // Look up the allocator that serves the destination device type.
+  auto* allocator = runtime::get_device_allocator(device.type());
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device.type()));
+
+  const auto byte_count = cpu_tensor->nbytes();
+  const auto* cpu_data = cpu_tensor->const_data_ptr();
+  ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");
+
+  // Reserve the destination buffer, then fill it from the host bytes.
+  auto result = allocator->allocate(byte_count, device.index());
+  ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
+  void* const device_buffer = result.get();
+
+  auto err = allocator->copy_host_to_device(
+      device_buffer, cpu_data, byte_count, device.index());
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");
+
+  // Snapshot the source metadata into vectors that the new tensor will own.
+  const auto src_sizes = cpu_tensor->sizes();
+  const auto src_dim_order = cpu_tensor->dim_order();
+  const auto src_strides = cpu_tensor->strides();
+  std::vector<executorch::aten::SizesType> sizes_copy(
+      src_sizes.begin(), src_sizes.end());
+  std::vector<executorch::aten::DimOrderType> dim_order_copy(
+      src_dim_order.begin(), src_dim_order.end());
+  std::vector<executorch::aten::StridesType> strides_copy(
+      src_strides.begin(), src_strides.end());
+
+  // The buffer must be returned through the allocator that produced it.
+  auto release_device_buffer = [allocator, device](void* buffer) {
+    allocator->deallocate(buffer, device.index());
+  };
+
+  return make_tensor_ptr(
+      std::move(sizes_copy),
+      device_buffer,
+      std::move(dim_order_copy),
+      std::move(strides_copy),
+      cpu_tensor->scalar_type(),
+      cpu_tensor->shape_dynamism(),
+      std::move(release_device_buffer),
+      device);
+}
+
+/**
+ * Produces a CPU copy of a tensor whose data lives in device memory.
+ *
+ * The bytes are read back through the DeviceAllocator registered for the
+ * source tensor's device type into a host buffer that the returned tensor
+ * owns. Sizes, dim order, strides, scalar type and shape dynamism are taken
+ * from the source tensor.
+ *
+ * Validation runs in the same order as clone_tensor_ptr_to_device (placement,
+ * then allocator, then data), so a CPU tensor passed by mistake is reported
+ * as "already on CPU" rather than as a device tensor with missing data.
+ *
+ * Each precondition and the copy itself are guarded by ET_CHECK_MSG.
+ *
+ * @param device_tensor Source tensor; must report a non-CPU device and have
+ * non-null data.
+ * @return A TensorPtr backed by a host-side copy of the source bytes.
+ */
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
+  // Confirm the source really is a device tensor before inspecting its data,
+  // so the diagnostic names the actual problem.
+  const auto device = device_tensor->device();
+  ET_CHECK_MSG(!device.is_cpu(), "Source tensor is already on CPU.");
+
+  auto* allocator = runtime::get_device_allocator(device.type());
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device.type()));
+
+  const auto* device_data = device_tensor->const_data_ptr();
+  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");
+
+  // Host staging buffer; ownership moves into the returned tensor below.
+  const auto nbytes = device_tensor->nbytes();
+  std::vector<uint8_t> cpu_data(nbytes);
+
+  auto err = allocator->copy_device_to_host(
+      cpu_data.data(), device_data, nbytes, device.index());
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");
+
+  // Copy the metadata into vectors that the new tensor will own.
+  std::vector<executorch::aten::SizesType> sizes(
+      device_tensor->sizes().begin(), device_tensor->sizes().end());
+  std::vector<executorch::aten::DimOrderType> dim_order(
+      device_tensor->dim_order().begin(), device_tensor->dim_order().end());
+  std::vector<executorch::aten::StridesType> strides(
+      device_tensor->strides().begin(), device_tensor->strides().end());
+
+  return make_tensor_ptr(
+      std::move(sizes),
+      std::move(cpu_data),
+      std::move(dim_order),
+      std::move(strides),
+      device_tensor->scalar_type(),
+      device_tensor->shape_dynamism());
+}
+
+#endif // USE_ATEN_LIB
+
 } // namespace extension
 } // namespace executorch