@@ -246,3 +246,246 @@ TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) {
246246 EXPECT_EQ (dst_data[i], src_data[i]);
247247 }
248248}
249+
250+ // H2D: out has a LARGER upper-bound capacity + dynamic shape, self is SMALLER.
251+ // After the op, out is resized down to self's shape and holds self's values.
252+ TEST_F (OpDeviceCopyTest, H2dCopyDynamicShapeResizesOutDownToInput) {
253+ // CPU source: actual (smaller) shape [4].
254+ float src_data[] = {1 .0f , 2 .0f , 3 .0f , 4 .0f };
255+ int32_t src_sizes[] = {4 };
256+ uint8_t src_dim_order[] = {0 };
257+ int32_t src_strides[] = {1 };
258+ TensorImpl src_impl (
259+ ScalarType::Float,
260+ 1 ,
261+ src_sizes,
262+ src_data,
263+ src_dim_order,
264+ src_strides,
265+ TensorShapeDynamism::STATIC ,
266+ DeviceType::CPU ,
267+ 0 );
268+ Tensor src (&src_impl);
269+
270+ // CUDA destination: planned at upper bound [8] (capacity = 8 elems), dynamic.
271+ float dst_data[] = {0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
272+ int32_t dst_sizes[] = {8 };
273+ uint8_t dst_dim_order[] = {0 };
274+ int32_t dst_strides[] = {1 };
275+ TensorImpl dst_impl (
276+ ScalarType::Float,
277+ 1 ,
278+ dst_sizes,
279+ dst_data,
280+ dst_dim_order,
281+ dst_strides,
282+ TensorShapeDynamism::DYNAMIC_BOUND ,
283+ DeviceType::CUDA ,
284+ 0 );
285+ Tensor dst (&dst_impl);
286+
287+ Tensor& result = op_h2d_copy_out (src, dst);
288+
289+ // out was resized down to match self.
290+ EXPECT_EQ (dst.dim (), 1 );
291+ EXPECT_EQ (dst.size (0 ), 4 );
292+ EXPECT_EQ (dst.numel (), 4 );
293+
294+ // Only self.nbytes() worth of data was copied.
295+ EXPECT_EQ (g_mock_cuda.h2d_count_ , 1 );
296+ EXPECT_EQ (g_mock_cuda.last_h2d_size_ , 4 * sizeof (float ));
297+
298+ // out values equal self values.
299+ EXPECT_EQ (dst_data[0 ], 1 .0f );
300+ EXPECT_EQ (dst_data[1 ], 2 .0f );
301+ EXPECT_EQ (dst_data[2 ], 3 .0f );
302+ EXPECT_EQ (dst_data[3 ], 4 .0f );
303+
304+ EXPECT_EQ (&result, &dst);
305+ }
306+
307+ // D2H: mirror of the above, device -> host with a larger planned out buffer.
308+ TEST_F (OpDeviceCopyTest, D2hCopyDynamicShapeResizesOutDownToInput) {
309+ // CUDA source: actual (smaller) shape [4].
310+ float src_data[] = {5 .0f , 6 .0f , 7 .0f , 8 .0f };
311+ int32_t src_sizes[] = {4 };
312+ uint8_t src_dim_order[] = {0 };
313+ int32_t src_strides[] = {1 };
314+ TensorImpl src_impl (
315+ ScalarType::Float,
316+ 1 ,
317+ src_sizes,
318+ src_data,
319+ src_dim_order,
320+ src_strides,
321+ TensorShapeDynamism::STATIC ,
322+ DeviceType::CUDA ,
323+ 0 );
324+ Tensor src (&src_impl);
325+
326+ // CPU destination: planned at upper bound [8] (capacity = 8 elems), dynamic.
327+ float dst_data[] = {0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
328+ int32_t dst_sizes[] = {8 };
329+ uint8_t dst_dim_order[] = {0 };
330+ int32_t dst_strides[] = {1 };
331+ TensorImpl dst_impl (
332+ ScalarType::Float,
333+ 1 ,
334+ dst_sizes,
335+ dst_data,
336+ dst_dim_order,
337+ dst_strides,
338+ TensorShapeDynamism::DYNAMIC_BOUND ,
339+ DeviceType::CPU ,
340+ 0 );
341+ Tensor dst (&dst_impl);
342+
343+ Tensor& result = op_d2h_copy_out (src, dst);
344+
345+ EXPECT_EQ (dst.dim (), 1 );
346+ EXPECT_EQ (dst.size (0 ), 4 );
347+ EXPECT_EQ (dst.numel (), 4 );
348+
349+ EXPECT_EQ (g_mock_cuda.d2h_count_ , 1 );
350+ EXPECT_EQ (g_mock_cuda.last_d2h_size_ , 4 * sizeof (float ));
351+
352+ EXPECT_EQ (dst_data[0 ], 5 .0f );
353+ EXPECT_EQ (dst_data[1 ], 6 .0f );
354+ EXPECT_EQ (dst_data[2 ], 7 .0f );
355+ EXPECT_EQ (dst_data[3 ], 8 .0f );
356+
357+ EXPECT_EQ (&result, &dst);
358+ }
359+
360+ // H2D: self LARGER than out's planned capacity -> resize fails -> op errors
361+ // with InvalidArgument and does NOT copy.
362+ TEST_F (OpDeviceCopyTest, H2dCopyFailsWhenInputExceedsOutCapacity) {
363+ // CPU source: shape [4].
364+ float src_data[] = {1 .0f , 2 .0f , 3 .0f , 4 .0f };
365+ int32_t src_sizes[] = {4 };
366+ uint8_t src_dim_order[] = {0 };
367+ int32_t src_strides[] = {1 };
368+ TensorImpl src_impl (
369+ ScalarType::Float,
370+ 1 ,
371+ src_sizes,
372+ src_data,
373+ src_dim_order,
374+ src_strides,
375+ TensorShapeDynamism::STATIC ,
376+ DeviceType::CPU ,
377+ 0 );
378+ Tensor src (&src_impl);
379+
380+ // CUDA destination: planned capacity only [2], smaller than self.
381+ float dst_data[] = {0 , 0 };
382+ int32_t dst_sizes[] = {2 };
383+ uint8_t dst_dim_order[] = {0 };
384+ int32_t dst_strides[] = {1 };
385+ TensorImpl dst_impl (
386+ ScalarType::Float,
387+ 1 ,
388+ dst_sizes,
389+ dst_data,
390+ dst_dim_order,
391+ dst_strides,
392+ TensorShapeDynamism::DYNAMIC_BOUND ,
393+ DeviceType::CUDA ,
394+ 0 );
395+ Tensor dst (&dst_impl);
396+
397+ ET_EXPECT_KERNEL_FAILURE (context_, op_h2d_copy_out (src, dst));
398+
399+ #ifndef USE_ATEN_LIB
400+ EXPECT_EQ (context_.failure_state (), Error::InvalidArgument);
401+ #endif
402+ // The kernel bailed before copying.
403+ EXPECT_EQ (g_mock_cuda.h2d_count_ , 0 );
404+ }
405+
406+ // D2H: self LARGER than out's planned capacity -> resize fails -> op errors
407+ // with InvalidArgument and does NOT copy.
408+ TEST_F (OpDeviceCopyTest, D2hCopyFailsWhenInputExceedsOutCapacity) {
409+ // CUDA source: shape [4].
410+ float src_data[] = {5 .0f , 6 .0f , 7 .0f , 8 .0f };
411+ int32_t src_sizes[] = {4 };
412+ uint8_t src_dim_order[] = {0 };
413+ int32_t src_strides[] = {1 };
414+ TensorImpl src_impl (
415+ ScalarType::Float,
416+ 1 ,
417+ src_sizes,
418+ src_data,
419+ src_dim_order,
420+ src_strides,
421+ TensorShapeDynamism::STATIC ,
422+ DeviceType::CUDA ,
423+ 0 );
424+ Tensor src (&src_impl);
425+
426+ // CPU destination: planned capacity only [2], smaller than self.
427+ float dst_data[] = {0 , 0 };
428+ int32_t dst_sizes[] = {2 };
429+ uint8_t dst_dim_order[] = {0 };
430+ int32_t dst_strides[] = {1 };
431+ TensorImpl dst_impl (
432+ ScalarType::Float,
433+ 1 ,
434+ dst_sizes,
435+ dst_data,
436+ dst_dim_order,
437+ dst_strides,
438+ TensorShapeDynamism::DYNAMIC_BOUND ,
439+ DeviceType::CPU ,
440+ 0 );
441+ Tensor dst (&dst_impl);
442+
443+ ET_EXPECT_KERNEL_FAILURE (context_, op_d2h_copy_out (src, dst));
444+
445+ #ifndef USE_ATEN_LIB
446+ EXPECT_EQ (context_.failure_state (), Error::InvalidArgument);
447+ #endif
448+ EXPECT_EQ (g_mock_cuda.d2h_count_ , 0 );
449+ }
450+
451+ // Equal-size case under the dynamic-bound path: capacity == input size still
452+ // copies correctly (confirms existing behavior is preserved by the resize).
453+ TEST_F (OpDeviceCopyTest, H2dCopyDynamicBoundEqualSizeStillCopies) {
454+ float src_data[] = {1 .0f , 2 .0f , 3 .0f , 4 .0f };
455+ int32_t sizes[] = {4 };
456+ uint8_t dim_order[] = {0 };
457+ int32_t strides[] = {1 };
458+ TensorImpl src_impl (
459+ ScalarType::Float,
460+ 1 ,
461+ sizes,
462+ src_data,
463+ dim_order,
464+ strides,
465+ TensorShapeDynamism::STATIC ,
466+ DeviceType::CPU ,
467+ 0 );
468+ Tensor src (&src_impl);
469+
470+ float dst_data[] = {0 , 0 , 0 , 0 };
471+ TensorImpl dst_impl (
472+ ScalarType::Float,
473+ 1 ,
474+ sizes,
475+ dst_data,
476+ dim_order,
477+ strides,
478+ TensorShapeDynamism::DYNAMIC_BOUND ,
479+ DeviceType::CUDA ,
480+ 0 );
481+ Tensor dst (&dst_impl);
482+
483+ op_h2d_copy_out (src, dst);
484+
485+ EXPECT_EQ (dst.size (0 ), 4 );
486+ EXPECT_EQ (g_mock_cuda.h2d_count_ , 1 );
487+ EXPECT_EQ (g_mock_cuda.last_h2d_size_ , 4 * sizeof (float ));
488+ for (int i = 0 ; i < 4 ; ++i) {
489+ EXPECT_EQ (dst_data[i], src_data[i]);
490+ }
491+ }
0 commit comments