99#pragma once
1010
1111#include < cuda_runtime.h>
12+ #include < executorch/backends/aoti/slim/c10/cuda/Exception.h>
1213#include < executorch/runtime/core/error.h>
1314#include < executorch/runtime/core/exec_aten/exec_aten.h>
1415#include < executorch/runtime/core/exec_aten/util/tensor_util.h>
@@ -86,42 +87,85 @@ inline executorch::runtime::Error _check_tensor_metadata(
8687
8788 return executorch::runtime::Error::Ok;
8889}
89- } // namespace
90+ // Check if src and dst strides match (same layout, no rearrangement needed).
91+ inline bool _strides_match (
92+ const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
93+ const executorch::runtime::etensor::Tensor* etensor) {
94+ const size_t ndim = slim_tensor->dim ();
95+ auto slim_strides = slim_tensor->strides ();
96+ auto et_strides = etensor->strides ();
97+ for (size_t i = 0 ; i < ndim; ++i) {
98+ if (slim_strides[i] != static_cast <int64_t >(et_strides[i])) {
99+ return false ;
100+ }
101+ }
102+ return true ;
103+ }
90104
91- /* *
92- * Copies data from a SlimTensor to an ETensor asynchronously.
93- *
94- * This function converts a SlimTensor back to an ETensor using async copy.
95- * The ETensor is assumed to always reside on CPU, so this handles both
96- * CPU→CPU and GPU→CPU copies. The function will resize the ETensor if needed
97- * and copy the data asynchronously on the provided CUDA stream.
98- *
99- * NOTE: The caller must ensure proper synchronization after calling this
100- * function if the ETensor data is accessed on the CPU side.
101- *
102- * @param slim_tensor Pointer to the source SlimTensor (must not be null).
103- * @param etensor Pointer to the destination ETensor (must not be null).
104- * @param stream The CUDA stream to use for async copy.
105- * @return Error::Ok on success, or an appropriate error code on failure.
106- */
107- inline executorch::runtime::Error copy_slimtensor_to_etensor_async (
105+ // Element-by-element copy from a contiguous CPU buffer (src_strides layout)
106+ // to an ETensor (dst_strides layout), rearranging data to match.
107+ inline void _strided_copy (
108+ void * dst,
109+ const void * src,
110+ size_t elem_size,
111+ const std::vector<int64_t >& sizes,
112+ const std::vector<int64_t >& src_strides,
113+ const std::vector<int32_t >& dst_strides) {
114+ const size_t ndim = sizes.size ();
115+ const size_t numel = [&]() {
116+ size_t n = 1 ;
117+ for (auto s : sizes)
118+ n *= static_cast <size_t >(s);
119+ return n;
120+ }();
121+
122+ // Iterate over all elements using N-dimensional index
123+ std::vector<int64_t > idx (ndim, 0 );
124+ auto * dst_bytes = static_cast <char *>(dst);
125+ auto * src_bytes = static_cast <const char *>(src);
126+
127+ for (size_t i = 0 ; i < numel; ++i) {
128+ // Compute source and destination byte offsets
129+ size_t src_offset = 0 , dst_offset = 0 ;
130+ for (size_t d = 0 ; d < ndim; ++d) {
131+ src_offset += static_cast <size_t >(idx[d]) *
132+ static_cast <size_t >(src_strides[d]) * elem_size;
133+ dst_offset += static_cast <size_t >(idx[d]) *
134+ static_cast <size_t >(dst_strides[d]) * elem_size;
135+ }
136+ std::memcpy (dst_bytes + dst_offset, src_bytes + src_offset, elem_size);
137+
138+ // Increment N-dimensional index (last dimension fastest)
139+ for (int d = static_cast <int >(ndim) - 1 ; d >= 0 ; --d) {
140+ if (++idx[d] < sizes[d])
141+ break ;
142+ idx[d] = 0 ;
143+ }
144+ }
145+ }
146+
147+ // Copy data from SlimTensor to ETensor, rearranging if strides differ.
148+ // When stream is non-null, GPU copies use that stream (async fast path).
149+ // When stream is null, GPU copies are synchronous.
150+ inline executorch::runtime::Error _copy_slimtensor_to_etensor_impl (
108151 const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
109152 executorch::runtime::etensor::Tensor* etensor,
110153 cudaStream_t stream) {
111- _check_tensor_metadata (slim_tensor, etensor);
154+ ET_CHECK_OK_OR_RETURN_ERROR ( _check_tensor_metadata (slim_tensor, etensor) );
112155
113- // Copy data from SlimTensor to ETensor
114- // SlimTensor may be on GPU or CPU, ETensor is always on CPU
115156 size_t nbytes = slim_tensor->nbytes ();
116- if (nbytes > 0 ) {
117- void * dst_data = etensor->mutable_data_ptr ();
118- const void * src_data = slim_tensor->data_ptr ();
157+ if (nbytes == 0 ) {
158+ return executorch::runtime::Error::Ok;
159+ }
160+
161+ void * dst_data = etensor->mutable_data_ptr ();
162+ const void * src_data = slim_tensor->data_ptr ();
119163
164+ if (_strides_match (slim_tensor, etensor)) {
165+ // Fast path: strides match, raw byte copy
120166 if (slim_tensor->is_cpu ()) {
121- // CPU → CPU copy (always synchronous)
122167 std::memcpy (dst_data, src_data, nbytes);
123- } else {
124- // GPU → CPU async copy
168+ } else if (stream) {
125169 executorch::backends::aoti::slim::DeviceTraits<
126170 executorch::backends::aoti::slim::c10::DeviceType::CUDA >::
127171 memcpy_async (
@@ -131,18 +175,88 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
131175 executorch::backends::aoti::slim::CPU_DEVICE ,
132176 slim_tensor->device (),
133177 stream);
178+ } else {
179+ executorch::backends::aoti::slim::DeviceTraits<
180+ executorch::backends::aoti::slim::c10::DeviceType::CUDA >::
181+ memcpy (
182+ dst_data,
183+ src_data,
184+ nbytes,
185+ executorch::backends::aoti::slim::CPU_DEVICE ,
186+ slim_tensor->device ());
187+ }
188+ } else {
189+ // Slow path: strides differ (e.g., AOTI delegate output layout differs
190+ // from .pte's dim_order). Copy to a temp CPU buffer, then rearrange
191+ // element-by-element to match the ETensor's expected layout.
192+ std::vector<char > tmp (nbytes);
193+ if (slim_tensor->is_cpu ()) {
194+ std::memcpy (tmp.data (), src_data, nbytes);
195+ } else {
196+ if (stream) {
197+ ET_CUDA_CHECK_OR_RETURN_ERROR (cudaStreamSynchronize (stream));
198+ }
199+ ET_CUDA_CHECK_OR_RETURN_ERROR (
200+ cudaMemcpy (tmp.data (), src_data, nbytes, cudaMemcpyDeviceToHost));
134201 }
202+
203+ const size_t ndim = slim_tensor->dim ();
204+ auto slim_sizes = slim_tensor->sizes ();
205+ auto slim_strides = slim_tensor->strides ();
206+ auto et_strides = etensor->strides ();
207+
208+ std::vector<int64_t > sizes_vec (ndim);
209+ std::vector<int64_t > src_strides_vec (ndim);
210+ std::vector<int32_t > dst_strides_vec (ndim);
211+ for (size_t i = 0 ; i < ndim; ++i) {
212+ sizes_vec[i] = slim_sizes[i];
213+ src_strides_vec[i] = slim_strides[i];
214+ dst_strides_vec[i] = et_strides[i];
215+ }
216+
217+ size_t elem_size = executorch::backends::aoti::slim::c10::elementSize (
218+ slim_tensor->dtype ());
219+ _strided_copy (
220+ dst_data,
221+ tmp.data (),
222+ elem_size,
223+ sizes_vec,
224+ src_strides_vec,
225+ dst_strides_vec);
135226 }
136227
137228 return executorch::runtime::Error::Ok;
138229}
230+ } // namespace
231+
232+ /* *
233+ * Copies data from a SlimTensor to an ETensor asynchronously.
234+ *
235+ * When strides match (common case), performs a fast async GPU-to-CPU copy on
236+ * the provided stream. When strides differ (e.g., AOTI delegate output layout
237+ * differs from the .pte's dim_order), falls back to a synchronous copy with
238+ * element-by-element rearrangement.
239+ *
240+ * NOTE: In the fast path the copy is asynchronous. The caller must synchronize
241+ * the stream before reading the ETensor data on the CPU side.
242+ *
243+ * @param slim_tensor Pointer to the source SlimTensor (must not be null).
244+ * @param etensor Pointer to the destination ETensor (must not be null).
245+ * @param stream The CUDA stream to use for async copy.
246+ * @return Error::Ok on success, or an appropriate error code on failure.
247+ */
248+ inline executorch::runtime::Error copy_slimtensor_to_etensor_async (
249+ const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
250+ executorch::runtime::etensor::Tensor* etensor,
251+ cudaStream_t stream) {
252+ return _copy_slimtensor_to_etensor_impl (slim_tensor, etensor, stream);
253+ }
139254
140255/* *
141256 * Copies data from a SlimTensor to an ETensor synchronously.
142257 *
143- * This function converts a SlimTensor back to an ETensor. The ETensor is
144- * assumed to always reside on CPU, so this handles both CPU→CPU and GPU→CPU
145- * copies. The function will resize the ETensor if needed and copy the data.
258+ * Handles stride mismatches between the delegate output and the .pte's
259+ * expected layout by rearranging data element-by-element when needed.
146260 *
147261 * @param slim_tensor Pointer to the source SlimTensor (must not be null).
148262 * @param etensor Pointer to the destination ETensor (must not be null).
@@ -151,32 +265,7 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
151265inline executorch::runtime::Error copy_slimtensor_to_etensor (
152266 const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
153267 executorch::runtime::etensor::Tensor* etensor) {
154- _check_tensor_metadata (slim_tensor, etensor);
155-
156- // Copy data from SlimTensor to ETensor
157- // SlimTensor may be on GPU or CPU, ETensor is always on CPU
158- size_t nbytes = slim_tensor->nbytes ();
159- if (nbytes > 0 ) {
160- void * dst_data = etensor->mutable_data_ptr ();
161- const void * src_data = slim_tensor->data_ptr ();
162-
163- if (slim_tensor->is_cpu ()) {
164- // CPU → CPU copy
165- std::memcpy (dst_data, src_data, nbytes);
166- } else {
167- // GPU → CPU synchronous copy
168- executorch::backends::aoti::slim::DeviceTraits<
169- executorch::backends::aoti::slim::c10::DeviceType::CUDA >::
170- memcpy (
171- dst_data,
172- src_data,
173- nbytes,
174- executorch::backends::aoti::slim::CPU_DEVICE ,
175- slim_tensor->device ());
176- }
177- }
178-
179- return executorch::runtime::Error::Ok;
268+ return _copy_slimtensor_to_etensor_impl (slim_tensor, etensor, nullptr );
180269}
181270
182271/* *
@@ -197,7 +286,7 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor(
197286inline executorch::runtime::Error wrap_slimtensor_to_etensor (
198287 const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
199288 executorch::runtime::etensor::Tensor* etensor) {
200- _check_tensor_metadata (slim_tensor, etensor);
289+ ET_CHECK_OK_OR_RETURN_ERROR ( _check_tensor_metadata (slim_tensor, etensor) );
201290
202291 // Set data pointer to point directly to SlimTensor's data (zero-copy)
203292 etensor->unsafeGetTensorImpl ()->set_data (
0 commit comments