Skip to content

Commit e0dfec5

Browse files
authored
Make the device copy operators support dynamic shapes
Differential Revision: D107901331 Pull Request resolved: pytorch#20116
1 parent ed9ffa5 commit e0dfec5

2 files changed

Lines changed: 251 additions & 8 deletions

File tree

kernels/portable/cpu/op__device_copy.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,15 @@ _h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
5656
out,
5757
"_h2d_copy: destination tensor must be on a non-CPU device");
5858

59-
auto nbytes = self.nbytes();
6059
ET_KERNEL_CHECK_MSG(
6160
ctx,
62-
nbytes == out.nbytes(),
61+
resize_tensor(out, self.sizes()) == Error::Ok,
6362
InvalidArgument,
6463
out,
65-
"_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
66-
nbytes,
64+
"_h2d_copy: cannot resize out to self sizes (self.nbytes()=%zu exceeds out planned capacity %zu?)",
65+
self.nbytes(),
6766
out.nbytes());
67+
auto nbytes = self.nbytes();
6868

6969
DeviceAllocator* allocator =
7070
executorch::runtime::get_device_allocator(device_type);
@@ -117,15 +117,15 @@ _d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) {
117117
"_d2h_copy: destination tensor must be on CPU, got device_type=%d",
118118
static_cast<int>(out.unsafeGetTensorImpl()->device_type()));
119119

120-
auto nbytes = self.nbytes();
121120
ET_KERNEL_CHECK_MSG(
122121
ctx,
123-
nbytes == out.nbytes(),
122+
resize_tensor(out, self.sizes()) == Error::Ok,
124123
InvalidArgument,
125124
out,
126-
"_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu",
127-
nbytes,
125+
"_d2h_copy: cannot resize out to self sizes (self.nbytes()=%zu exceeds out planned capacity %zu?)",
126+
self.nbytes(),
128127
out.nbytes());
128+
auto nbytes = self.nbytes();
129129

130130
DeviceAllocator* allocator =
131131
executorch::runtime::get_device_allocator(device_type);

kernels/test/op__device_copy_test.cpp

Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,3 +246,246 @@ TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) {
246246
EXPECT_EQ(dst_data[i], src_data[i]);
247247
}
248248
}
249+
250+
// H2D: out has a LARGER upper-bound capacity + dynamic shape, self is SMALLER.
251+
// After the op, out is resized down to self's shape and holds self's values.
252+
TEST_F(OpDeviceCopyTest, H2dCopyDynamicShapeResizesOutDownToInput) {
253+
// CPU source: actual (smaller) shape [4].
254+
float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
255+
int32_t src_sizes[] = {4};
256+
uint8_t src_dim_order[] = {0};
257+
int32_t src_strides[] = {1};
258+
TensorImpl src_impl(
259+
ScalarType::Float,
260+
1,
261+
src_sizes,
262+
src_data,
263+
src_dim_order,
264+
src_strides,
265+
TensorShapeDynamism::STATIC,
266+
DeviceType::CPU,
267+
0);
268+
Tensor src(&src_impl);
269+
270+
// CUDA destination: planned at upper bound [8] (capacity = 8 elems), dynamic.
271+
float dst_data[] = {0, 0, 0, 0, 0, 0, 0, 0};
272+
int32_t dst_sizes[] = {8};
273+
uint8_t dst_dim_order[] = {0};
274+
int32_t dst_strides[] = {1};
275+
TensorImpl dst_impl(
276+
ScalarType::Float,
277+
1,
278+
dst_sizes,
279+
dst_data,
280+
dst_dim_order,
281+
dst_strides,
282+
TensorShapeDynamism::DYNAMIC_BOUND,
283+
DeviceType::CUDA,
284+
0);
285+
Tensor dst(&dst_impl);
286+
287+
Tensor& result = op_h2d_copy_out(src, dst);
288+
289+
// out was resized down to match self.
290+
EXPECT_EQ(dst.dim(), 1);
291+
EXPECT_EQ(dst.size(0), 4);
292+
EXPECT_EQ(dst.numel(), 4);
293+
294+
// Only self.nbytes() worth of data was copied.
295+
EXPECT_EQ(g_mock_cuda.h2d_count_, 1);
296+
EXPECT_EQ(g_mock_cuda.last_h2d_size_, 4 * sizeof(float));
297+
298+
// out values equal self values.
299+
EXPECT_EQ(dst_data[0], 1.0f);
300+
EXPECT_EQ(dst_data[1], 2.0f);
301+
EXPECT_EQ(dst_data[2], 3.0f);
302+
EXPECT_EQ(dst_data[3], 4.0f);
303+
304+
EXPECT_EQ(&result, &dst);
305+
}
306+
307+
// D2H: mirror of the above, device -> host with a larger planned out buffer.
308+
TEST_F(OpDeviceCopyTest, D2hCopyDynamicShapeResizesOutDownToInput) {
309+
// CUDA source: actual (smaller) shape [4].
310+
float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f};
311+
int32_t src_sizes[] = {4};
312+
uint8_t src_dim_order[] = {0};
313+
int32_t src_strides[] = {1};
314+
TensorImpl src_impl(
315+
ScalarType::Float,
316+
1,
317+
src_sizes,
318+
src_data,
319+
src_dim_order,
320+
src_strides,
321+
TensorShapeDynamism::STATIC,
322+
DeviceType::CUDA,
323+
0);
324+
Tensor src(&src_impl);
325+
326+
// CPU destination: planned at upper bound [8] (capacity = 8 elems), dynamic.
327+
float dst_data[] = {0, 0, 0, 0, 0, 0, 0, 0};
328+
int32_t dst_sizes[] = {8};
329+
uint8_t dst_dim_order[] = {0};
330+
int32_t dst_strides[] = {1};
331+
TensorImpl dst_impl(
332+
ScalarType::Float,
333+
1,
334+
dst_sizes,
335+
dst_data,
336+
dst_dim_order,
337+
dst_strides,
338+
TensorShapeDynamism::DYNAMIC_BOUND,
339+
DeviceType::CPU,
340+
0);
341+
Tensor dst(&dst_impl);
342+
343+
Tensor& result = op_d2h_copy_out(src, dst);
344+
345+
EXPECT_EQ(dst.dim(), 1);
346+
EXPECT_EQ(dst.size(0), 4);
347+
EXPECT_EQ(dst.numel(), 4);
348+
349+
EXPECT_EQ(g_mock_cuda.d2h_count_, 1);
350+
EXPECT_EQ(g_mock_cuda.last_d2h_size_, 4 * sizeof(float));
351+
352+
EXPECT_EQ(dst_data[0], 5.0f);
353+
EXPECT_EQ(dst_data[1], 6.0f);
354+
EXPECT_EQ(dst_data[2], 7.0f);
355+
EXPECT_EQ(dst_data[3], 8.0f);
356+
357+
EXPECT_EQ(&result, &dst);
358+
}
359+
360+
// H2D: self LARGER than out's planned capacity -> resize fails -> op errors
361+
// with InvalidArgument and does NOT copy.
362+
TEST_F(OpDeviceCopyTest, H2dCopyFailsWhenInputExceedsOutCapacity) {
363+
// CPU source: shape [4].
364+
float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
365+
int32_t src_sizes[] = {4};
366+
uint8_t src_dim_order[] = {0};
367+
int32_t src_strides[] = {1};
368+
TensorImpl src_impl(
369+
ScalarType::Float,
370+
1,
371+
src_sizes,
372+
src_data,
373+
src_dim_order,
374+
src_strides,
375+
TensorShapeDynamism::STATIC,
376+
DeviceType::CPU,
377+
0);
378+
Tensor src(&src_impl);
379+
380+
// CUDA destination: planned capacity only [2], smaller than self.
381+
float dst_data[] = {0, 0};
382+
int32_t dst_sizes[] = {2};
383+
uint8_t dst_dim_order[] = {0};
384+
int32_t dst_strides[] = {1};
385+
TensorImpl dst_impl(
386+
ScalarType::Float,
387+
1,
388+
dst_sizes,
389+
dst_data,
390+
dst_dim_order,
391+
dst_strides,
392+
TensorShapeDynamism::DYNAMIC_BOUND,
393+
DeviceType::CUDA,
394+
0);
395+
Tensor dst(&dst_impl);
396+
397+
ET_EXPECT_KERNEL_FAILURE(context_, op_h2d_copy_out(src, dst));
398+
399+
#ifndef USE_ATEN_LIB
400+
EXPECT_EQ(context_.failure_state(), Error::InvalidArgument);
401+
#endif
402+
// The kernel bailed before copying.
403+
EXPECT_EQ(g_mock_cuda.h2d_count_, 0);
404+
}
405+
406+
// D2H: self LARGER than out's planned capacity -> resize fails -> op errors
407+
// with InvalidArgument and does NOT copy.
408+
TEST_F(OpDeviceCopyTest, D2hCopyFailsWhenInputExceedsOutCapacity) {
409+
// CUDA source: shape [4].
410+
float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f};
411+
int32_t src_sizes[] = {4};
412+
uint8_t src_dim_order[] = {0};
413+
int32_t src_strides[] = {1};
414+
TensorImpl src_impl(
415+
ScalarType::Float,
416+
1,
417+
src_sizes,
418+
src_data,
419+
src_dim_order,
420+
src_strides,
421+
TensorShapeDynamism::STATIC,
422+
DeviceType::CUDA,
423+
0);
424+
Tensor src(&src_impl);
425+
426+
// CPU destination: planned capacity only [2], smaller than self.
427+
float dst_data[] = {0, 0};
428+
int32_t dst_sizes[] = {2};
429+
uint8_t dst_dim_order[] = {0};
430+
int32_t dst_strides[] = {1};
431+
TensorImpl dst_impl(
432+
ScalarType::Float,
433+
1,
434+
dst_sizes,
435+
dst_data,
436+
dst_dim_order,
437+
dst_strides,
438+
TensorShapeDynamism::DYNAMIC_BOUND,
439+
DeviceType::CPU,
440+
0);
441+
Tensor dst(&dst_impl);
442+
443+
ET_EXPECT_KERNEL_FAILURE(context_, op_d2h_copy_out(src, dst));
444+
445+
#ifndef USE_ATEN_LIB
446+
EXPECT_EQ(context_.failure_state(), Error::InvalidArgument);
447+
#endif
448+
EXPECT_EQ(g_mock_cuda.d2h_count_, 0);
449+
}
450+
451+
// Equal-size case under the dynamic-bound path: capacity == input size still
452+
// copies correctly (confirms existing behavior is preserved by the resize).
453+
TEST_F(OpDeviceCopyTest, H2dCopyDynamicBoundEqualSizeStillCopies) {
454+
float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
455+
int32_t sizes[] = {4};
456+
uint8_t dim_order[] = {0};
457+
int32_t strides[] = {1};
458+
TensorImpl src_impl(
459+
ScalarType::Float,
460+
1,
461+
sizes,
462+
src_data,
463+
dim_order,
464+
strides,
465+
TensorShapeDynamism::STATIC,
466+
DeviceType::CPU,
467+
0);
468+
Tensor src(&src_impl);
469+
470+
float dst_data[] = {0, 0, 0, 0};
471+
TensorImpl dst_impl(
472+
ScalarType::Float,
473+
1,
474+
sizes,
475+
dst_data,
476+
dim_order,
477+
strides,
478+
TensorShapeDynamism::DYNAMIC_BOUND,
479+
DeviceType::CUDA,
480+
0);
481+
Tensor dst(&dst_impl);
482+
483+
op_h2d_copy_out(src, dst);
484+
485+
EXPECT_EQ(dst.size(0), 4);
486+
EXPECT_EQ(g_mock_cuda.h2d_count_, 1);
487+
EXPECT_EQ(g_mock_cuda.last_h2d_size_, 4 * sizeof(float));
488+
for (int i = 0; i < 4; ++i) {
489+
EXPECT_EQ(dst_data[i], src_data[i]);
490+
}
491+
}

0 commit comments

Comments
 (0)