Skip to content

Commit eb77ed4

Browse files
authored
CUDA backend: fix output stride mismatch in delegate copy-back (#17945)
The .pte serializes the output dim_order from PyTorch's SDPA composite (which may return a non-contiguous transposed view, e.g., efficient attention outputs [B,Lq,H,D] transposed to [B,H,Lq,D]). However, the AOTI delegate always produces contiguous output in its own layout, ignoring the .pte's expected dim_order. The runtime byte-copies the GPU data to the CPU ETensor, but the ETensor interprets it with the .pte's strides — causing silent data corruption when the layouts differ. Fix: in copy_slimtensor_to_etensor, detect when the SlimTensor (GPU) and ETensor (CPU) have different strides. When they match (common case), use the fast byte-copy path. When they differ, copy GPU data to a temp CPU buffer then rearrange element-by-element to match the ETensor's expected layout. Also enables the accuracy check in test_non_pow2_head_dim_with_bool_mask and adds test_output_stride_rearrange.py exercising both fast and slow copy paths with Triton ON and OFF.
1 parent 38e83d1 commit eb77ed4

3 files changed

Lines changed: 327 additions & 63 deletions

File tree

backends/cuda/runtime/utils.h

Lines changed: 146 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010

1111
#include <cuda_runtime.h>
12+
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
1213
#include <executorch/runtime/core/error.h>
1314
#include <executorch/runtime/core/exec_aten/exec_aten.h>
1415
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
@@ -86,42 +87,85 @@ inline executorch::runtime::Error _check_tensor_metadata(
8687

8788
return executorch::runtime::Error::Ok;
8889
}
89-
} // namespace
90+
// Check if src and dst strides match (same layout, no rearrangement needed).
91+
inline bool _strides_match(
92+
const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
93+
const executorch::runtime::etensor::Tensor* etensor) {
94+
const size_t ndim = slim_tensor->dim();
95+
auto slim_strides = slim_tensor->strides();
96+
auto et_strides = etensor->strides();
97+
for (size_t i = 0; i < ndim; ++i) {
98+
if (slim_strides[i] != static_cast<int64_t>(et_strides[i])) {
99+
return false;
100+
}
101+
}
102+
return true;
103+
}
90104

91-
/**
92-
* Copies data from a SlimTensor to an ETensor asynchronously.
93-
*
94-
* This function converts a SlimTensor back to an ETensor using async copy.
95-
* The ETensor is assumed to always reside on CPU, so this handles both
96-
* CPU→CPU and GPU→CPU copies. The function will resize the ETensor if needed
97-
* and copy the data asynchronously on the provided CUDA stream.
98-
*
99-
* NOTE: The caller must ensure proper synchronization after calling this
100-
* function if the ETensor data is accessed on the CPU side.
101-
*
102-
* @param slim_tensor Pointer to the source SlimTensor (must not be null).
103-
* @param etensor Pointer to the destination ETensor (must not be null).
104-
* @param stream The CUDA stream to use for async copy.
105-
* @return Error::Ok on success, or an appropriate error code on failure.
106-
*/
107-
inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
105+
// Element-by-element copy from a contiguous CPU buffer (src_strides layout)
106+
// to an ETensor (dst_strides layout), rearranging data to match.
107+
inline void _strided_copy(
108+
void* dst,
109+
const void* src,
110+
size_t elem_size,
111+
const std::vector<int64_t>& sizes,
112+
const std::vector<int64_t>& src_strides,
113+
const std::vector<int32_t>& dst_strides) {
114+
const size_t ndim = sizes.size();
115+
const size_t numel = [&]() {
116+
size_t n = 1;
117+
for (auto s : sizes)
118+
n *= static_cast<size_t>(s);
119+
return n;
120+
}();
121+
122+
// Iterate over all elements using N-dimensional index
123+
std::vector<int64_t> idx(ndim, 0);
124+
auto* dst_bytes = static_cast<char*>(dst);
125+
auto* src_bytes = static_cast<const char*>(src);
126+
127+
for (size_t i = 0; i < numel; ++i) {
128+
// Compute source and destination byte offsets
129+
size_t src_offset = 0, dst_offset = 0;
130+
for (size_t d = 0; d < ndim; ++d) {
131+
src_offset += static_cast<size_t>(idx[d]) *
132+
static_cast<size_t>(src_strides[d]) * elem_size;
133+
dst_offset += static_cast<size_t>(idx[d]) *
134+
static_cast<size_t>(dst_strides[d]) * elem_size;
135+
}
136+
std::memcpy(dst_bytes + dst_offset, src_bytes + src_offset, elem_size);
137+
138+
// Increment N-dimensional index (last dimension fastest)
139+
for (int d = static_cast<int>(ndim) - 1; d >= 0; --d) {
140+
if (++idx[d] < sizes[d])
141+
break;
142+
idx[d] = 0;
143+
}
144+
}
145+
}
146+
147+
// Copy data from SlimTensor to ETensor, rearranging if strides differ.
148+
// When stream is non-null, GPU copies use that stream (async fast path).
149+
// When stream is null, GPU copies are synchronous.
150+
inline executorch::runtime::Error _copy_slimtensor_to_etensor_impl(
108151
const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
109152
executorch::runtime::etensor::Tensor* etensor,
110153
cudaStream_t stream) {
111-
_check_tensor_metadata(slim_tensor, etensor);
154+
ET_CHECK_OK_OR_RETURN_ERROR(_check_tensor_metadata(slim_tensor, etensor));
112155

113-
// Copy data from SlimTensor to ETensor
114-
// SlimTensor may be on GPU or CPU, ETensor is always on CPU
115156
size_t nbytes = slim_tensor->nbytes();
116-
if (nbytes > 0) {
117-
void* dst_data = etensor->mutable_data_ptr();
118-
const void* src_data = slim_tensor->data_ptr();
157+
if (nbytes == 0) {
158+
return executorch::runtime::Error::Ok;
159+
}
160+
161+
void* dst_data = etensor->mutable_data_ptr();
162+
const void* src_data = slim_tensor->data_ptr();
119163

164+
if (_strides_match(slim_tensor, etensor)) {
165+
// Fast path: strides match, raw byte copy
120166
if (slim_tensor->is_cpu()) {
121-
// CPU → CPU copy (always synchronous)
122167
std::memcpy(dst_data, src_data, nbytes);
123-
} else {
124-
// GPU → CPU async copy
168+
} else if (stream) {
125169
executorch::backends::aoti::slim::DeviceTraits<
126170
executorch::backends::aoti::slim::c10::DeviceType::CUDA>::
127171
memcpy_async(
@@ -131,18 +175,88 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
131175
executorch::backends::aoti::slim::CPU_DEVICE,
132176
slim_tensor->device(),
133177
stream);
178+
} else {
179+
executorch::backends::aoti::slim::DeviceTraits<
180+
executorch::backends::aoti::slim::c10::DeviceType::CUDA>::
181+
memcpy(
182+
dst_data,
183+
src_data,
184+
nbytes,
185+
executorch::backends::aoti::slim::CPU_DEVICE,
186+
slim_tensor->device());
187+
}
188+
} else {
189+
// Slow path: strides differ (e.g., AOTI delegate output layout differs
190+
// from .pte's dim_order). Copy to a temp CPU buffer, then rearrange
191+
// element-by-element to match the ETensor's expected layout.
192+
std::vector<char> tmp(nbytes);
193+
if (slim_tensor->is_cpu()) {
194+
std::memcpy(tmp.data(), src_data, nbytes);
195+
} else {
196+
if (stream) {
197+
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamSynchronize(stream));
198+
}
199+
ET_CUDA_CHECK_OR_RETURN_ERROR(
200+
cudaMemcpy(tmp.data(), src_data, nbytes, cudaMemcpyDeviceToHost));
134201
}
202+
203+
const size_t ndim = slim_tensor->dim();
204+
auto slim_sizes = slim_tensor->sizes();
205+
auto slim_strides = slim_tensor->strides();
206+
auto et_strides = etensor->strides();
207+
208+
std::vector<int64_t> sizes_vec(ndim);
209+
std::vector<int64_t> src_strides_vec(ndim);
210+
std::vector<int32_t> dst_strides_vec(ndim);
211+
for (size_t i = 0; i < ndim; ++i) {
212+
sizes_vec[i] = slim_sizes[i];
213+
src_strides_vec[i] = slim_strides[i];
214+
dst_strides_vec[i] = et_strides[i];
215+
}
216+
217+
size_t elem_size = executorch::backends::aoti::slim::c10::elementSize(
218+
slim_tensor->dtype());
219+
_strided_copy(
220+
dst_data,
221+
tmp.data(),
222+
elem_size,
223+
sizes_vec,
224+
src_strides_vec,
225+
dst_strides_vec);
135226
}
136227

137228
return executorch::runtime::Error::Ok;
138229
}
230+
} // namespace
231+
232+
/**
233+
* Copies data from a SlimTensor to an ETensor asynchronously.
234+
*
235+
* When strides match (common case), performs a fast async GPU-to-CPU copy on
236+
* the provided stream. When strides differ (e.g., AOTI delegate output layout
237+
* differs from the .pte's dim_order), falls back to a synchronous copy with
238+
* element-by-element rearrangement.
239+
*
240+
* NOTE: In the fast path the copy is asynchronous. The caller must synchronize
241+
* the stream before reading the ETensor data on the CPU side.
242+
*
243+
* @param slim_tensor Pointer to the source SlimTensor (must not be null).
244+
* @param etensor Pointer to the destination ETensor (must not be null).
245+
* @param stream The CUDA stream to use for async copy.
246+
* @return Error::Ok on success, or an appropriate error code on failure.
247+
*/
248+
inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
249+
const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
250+
executorch::runtime::etensor::Tensor* etensor,
251+
cudaStream_t stream) {
252+
return _copy_slimtensor_to_etensor_impl(slim_tensor, etensor, stream);
253+
}
139254

140255
/**
141256
* Copies data from a SlimTensor to an ETensor synchronously.
142257
*
143-
* This function converts a SlimTensor back to an ETensor. The ETensor is
144-
* assumed to always reside on CPU, so this handles both CPU→CPU and GPU→CPU
145-
* copies. The function will resize the ETensor if needed and copy the data.
258+
* Handles stride mismatches between the delegate output and the .pte's
259+
* expected layout by rearranging data element-by-element when needed.
146260
*
147261
* @param slim_tensor Pointer to the source SlimTensor (must not be null).
148262
* @param etensor Pointer to the destination ETensor (must not be null).
@@ -151,32 +265,7 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
151265
inline executorch::runtime::Error copy_slimtensor_to_etensor(
152266
const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
153267
executorch::runtime::etensor::Tensor* etensor) {
154-
_check_tensor_metadata(slim_tensor, etensor);
155-
156-
// Copy data from SlimTensor to ETensor
157-
// SlimTensor may be on GPU or CPU, ETensor is always on CPU
158-
size_t nbytes = slim_tensor->nbytes();
159-
if (nbytes > 0) {
160-
void* dst_data = etensor->mutable_data_ptr();
161-
const void* src_data = slim_tensor->data_ptr();
162-
163-
if (slim_tensor->is_cpu()) {
164-
// CPU → CPU copy
165-
std::memcpy(dst_data, src_data, nbytes);
166-
} else {
167-
// GPU → CPU synchronous copy
168-
executorch::backends::aoti::slim::DeviceTraits<
169-
executorch::backends::aoti::slim::c10::DeviceType::CUDA>::
170-
memcpy(
171-
dst_data,
172-
src_data,
173-
nbytes,
174-
executorch::backends::aoti::slim::CPU_DEVICE,
175-
slim_tensor->device());
176-
}
177-
}
178-
179-
return executorch::runtime::Error::Ok;
268+
return _copy_slimtensor_to_etensor_impl(slim_tensor, etensor, nullptr);
180269
}
181270

182271
/**
@@ -197,7 +286,7 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor(
197286
inline executorch::runtime::Error wrap_slimtensor_to_etensor(
198287
const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
199288
executorch::runtime::etensor::Tensor* etensor) {
200-
_check_tensor_metadata(slim_tensor, etensor);
289+
ET_CHECK_OK_OR_RETURN_ERROR(_check_tensor_metadata(slim_tensor, etensor));
201290

202291
// Set data pointer to point directly to SlimTensor's data (zero-copy)
203292
etensor->unsafeGetTensorImpl()->set_data(

0 commit comments

Comments
 (0)