 
 #include <numeric>
 
+#include <executorch/runtime/core/device_allocator.h>
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 
 namespace executorch {
@@ -248,5 +249,179 @@ runtime::Error resize_tensor_ptr( |
       sizes.data(), sizes.size()));
 }
 
+// ---- Device tensor helpers ----
+
+namespace {
+
+#ifndef USE_ATEN_LIB
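+// Keeps alive everything the aliased Tensor points at: the TensorImpl, the
+// metadata vectors backing its sizes/dim_order/strides pointers, and the
+// deleter that frees the device buffer when the last reference drops.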
+struct DeviceStorage final {
+  executorch::aten::TensorImpl tensor_impl;
+  executorch::aten::Tensor tensor;
+  std::vector<executorch::aten::SizesType> sizes;
+  std::vector<executorch::aten::DimOrderType> dim_order;
+  std::vector<executorch::aten::StridesType> strides;
+  std::function<void(void*)> deleter;
+
+  DeviceStorage(
+      executorch::aten::TensorImpl&& tensor_impl,
+      std::vector<executorch::aten::SizesType>&& sizes,
+      std::vector<executorch::aten::DimOrderType>&& dim_order,
+      std::vector<executorch::aten::StridesType>&& strides,
+      std::function<void(void*)>&& deleter)
+      : tensor_impl(std::move(tensor_impl)),
+        tensor(&this->tensor_impl),
+        sizes(std::move(sizes)),
+        dim_order(std::move(dim_order)),
+        strides(std::move(strides)),
+        deleter(std::move(deleter)) {}
+
+  ~DeviceStorage() {
+    if (deleter) {
+      deleter(tensor_impl.mutable_data());
+    }
+  }
+};
+#endif // USE_ATEN_LIB
+
+TensorPtr make_tensor_ptr_with_device(
+    std::vector<executorch::aten::SizesType> sizes,
+    void* data,
+    executorch::aten::ScalarType type,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index,
+    std::function<void(void*)> deleter) {
+  const auto dim = sizes.size();
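+  // Identity dim order; the strides derived from it below are the standard
+  // contiguous strides for `sizes`.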
+  std::vector<executorch::aten::DimOrderType> dim_order(dim);
+  std::iota(dim_order.begin(), dim_order.end(), 0);
+
+  std::vector<executorch::aten::StridesType> strides(dim);
+  if (dim > 0) {
+    auto error = runtime::dim_order_to_stride(
+        sizes.data(), dim_order.data(), dim, strides.data());
+    ET_CHECK_MSG(error == runtime::Error::Ok, "Failed to compute strides.");
+  }
+
+#ifndef USE_ATEN_LIB
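+  // Construct a TensorImpl that records the device, then move it together
+  // with the metadata vectors backing its pointers into a shared
+  // DeviceStorage, and alias the shared_ptr to the stored Tensor.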
+  executorch::aten::TensorImpl tensor_impl(
+      type,
+      dim,
+      sizes.data(),
+      data,
+      dim_order.data(),
+      strides.data(),
+      dim > 0 ? executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND
+              : executorch::aten::TensorShapeDynamism::STATIC,
+      device_type,
+      device_index);
+  auto storage = std::make_shared<DeviceStorage>(
+      std::move(tensor_impl),
+      std::move(sizes),
+      std::move(dim_order),
+      std::move(strides),
+      std::move(deleter));
+  const auto tensor_ptr = &storage->tensor;
+  return std::shared_ptr<executorch::aten::Tensor>(
+      std::move(storage), tensor_ptr);
+#else
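+  // ATen mode: at::Tensor tracks its own device, so the etensor device
+  // arguments are unused here and the pointer is wrapped as-is.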
+  (void)device_type;
+  (void)device_index;
+  return make_tensor_ptr(
+      std::move(sizes),
+      data,
+      type,
+      executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+      std::move(deleter));
+#endif // USE_ATEN_LIB
+}
+
+} // namespace
+
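+// Allocates a buffer through the allocator registered for `device_type`,
+// copies the CPU tensor's bytes host-to-device, and wraps the result in a
+// TensorPtr whose deleter hands the buffer back to the allocator.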
+TensorPtr clone_tensor_ptr_to_device(
+    const TensorPtr& cpu_tensor,
+    runtime::etensor::DeviceType device_type,
+    runtime::etensor::DeviceIndex device_index) {
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Target device must not be CPU; use clone_tensor_ptr for CPU-to-CPU copies.");
+
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+
+  const auto nbytes = cpu_tensor->nbytes();
+  const auto* cpu_data = cpu_tensor->const_data_ptr();
+  ET_CHECK_MSG(cpu_data != nullptr, "Source tensor has no data.");
+
+  auto result = allocator->allocate(nbytes, device_index);
+  ET_CHECK_MSG(result.ok(), "Failed to allocate device memory.");
+  void* device_data = result.get();
+
+  auto err = allocator->copy_host_to_device(
+      device_data, cpu_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Host-to-device copy failed.");
+
+  std::vector<executorch::aten::SizesType> sizes(
+      cpu_tensor->sizes().begin(), cpu_tensor->sizes().end());
+
+  return make_tensor_ptr_with_device(
+      std::move(sizes),
+      device_data,
+      cpu_tensor->scalar_type(),
+      device_type,
+      device_index,
+      [allocator, device_index](void* ptr) {
+        allocator->deallocate(ptr, device_index);
+      });
+}
+
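+// Copies a device tensor's bytes back into freshly allocated host memory
+// owned by the returned CPU TensorPtr; the source tensor is not modified.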
+TensorPtr clone_tensor_ptr_to_cpu(const TensorPtr& device_tensor) {
+  const auto nbytes = device_tensor->nbytes();
+  const auto* device_data = device_tensor->const_data_ptr();
+  ET_CHECK_MSG(device_data != nullptr, "Source device tensor has no data.");
+
+#ifndef USE_ATEN_LIB
+  const auto device_type = device_tensor->unsafeGetTensorImpl()->device_type();
+  const auto device_index =
+      device_tensor->unsafeGetTensorImpl()->device_index();
+#else
+  const auto& aten_device = device_tensor->device();
+  ET_CHECK_MSG(!aten_device.is_cpu(), "Source tensor is already on CPU.");
+  ET_CHECK_MSG(
+      aten_device.is_cuda(),
+      "Unsupported source device; only CUDA maps to an etensor device type.");
+  const auto device_type = runtime::etensor::DeviceType::CUDA;
+  const auto device_index =
+      static_cast<runtime::etensor::DeviceIndex>(aten_device.index());
+#endif
+
+  ET_CHECK_MSG(
+      device_type != runtime::etensor::DeviceType::CPU,
+      "Source tensor is already on CPU.");
+
+  auto* allocator = runtime::get_device_allocator(device_type);
+  ET_CHECK_MSG(
+      allocator != nullptr,
+      "No device allocator registered for device type %d",
+      static_cast<int>(device_type));
+
+  std::vector<uint8_t> cpu_data(nbytes);
+
+  auto err = allocator->copy_device_to_host(
+      cpu_data.data(), device_data, nbytes, device_index);
+  ET_CHECK_MSG(err == runtime::Error::Ok, "Device-to-host copy failed.");
+
+  std::vector<executorch::aten::SizesType> sizes(
+      device_tensor->sizes().begin(), device_tensor->sizes().end());
+
+  return make_tensor_ptr(
+      std::move(sizes),
+      std::move(cpu_data),
+      {},
+      {},
+      device_tensor->scalar_type());
+}
+
 } // namespace extension
 } // namespace executorch
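
A minimal usage sketch of the new helpers (hypothetical: it assumes a CUDA allocator has been registered for runtime::etensor::DeviceType::CUDA through the device_allocator API this PR introduces, and that the new declarations are exposed from tensor_ptr.h):

#include <executorch/extension/tensor/tensor_ptr.h>

using namespace executorch;

void round_trip() {
  // Host tensor: 2x3 float with known values.
  auto cpu = extension::make_tensor_ptr(
      {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f});

  // Host-to-device copy; the returned TensorPtr owns the device buffer and
  // returns it to the registered allocator when the last reference drops.
  auto gpu = extension::clone_tensor_ptr_to_device(
      cpu, runtime::etensor::DeviceType::CUDA, /*device_index=*/0);

  // Device-to-host copy back into a freshly allocated CPU tensor.
  auto cpu_again = extension::clone_tensor_ptr_to_cpu(gpu);
}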