@@ -367,6 +367,82 @@ AOTITorchError aoti_torch_copy_(
367367 return Error::Ok;
368368}
369369
370+ // Check if a strided view is densely packed (no holes in memory).
371+ // A densely packed tensor's storage extent equals its numel.
372+ static bool is_packed_strides (
373+ const std::vector<aten::SizesType>& sizes,
374+ const std::vector<aten::StridesType>& strides) {
375+ int64_t ndim = static_cast <int64_t >(sizes.size ());
376+ if (ndim == 0 )
377+ return true ;
378+
379+ // Compute numel
380+ int64_t numel = 1 ;
381+ for (int64_t i = 0 ; i < ndim; i++) {
382+ numel *= sizes[i];
383+ }
384+ if (numel <= 1 )
385+ return true ;
386+
387+ // Compute storage extent: max offset + 1
388+ int64_t max_offset = 0 ;
389+ for (int64_t i = 0 ; i < ndim; i++) {
390+ if (sizes[i] > 1 ) {
391+ max_offset += (sizes[i] - 1 ) * strides[i];
392+ }
393+ }
394+ return (max_offset + 1 ) == numel;
395+ }
396+
397+ // Materialize a non-packed strided view into a new contiguous Metal buffer.
398+ // Copies elements from source using strided access. The caller must free the
399+ // returned buffer. On failure returns nullptr.
400+ static void * materialize_packed (
401+ void * src,
402+ const std::vector<aten::SizesType>& sizes,
403+ const std::vector<aten::StridesType>& strides,
404+ size_t element_size) {
405+ int64_t ndim = static_cast <int64_t >(sizes.size ());
406+ int64_t numel = 1 ;
407+ for (int64_t i = 0 ; i < ndim; i++) {
408+ numel *= sizes[i];
409+ }
410+
411+ void * dst = metal_allocate_buffer (numel * element_size);
412+ if (!dst)
413+ return nullptr ;
414+
415+ // Ensure pending GPU writes to the source buffer are complete
416+ auto * stream = getCurrentMetalStream ();
417+ if (stream) {
418+ stream->synchronize (SyncType::COMMIT_AND_WAIT);
419+ }
420+
421+ // Element-by-element strided copy
422+ char * src_bytes = static_cast <char *>(src);
423+ char * dst_bytes = static_cast <char *>(dst);
424+ std::vector<int64_t > coord (ndim, 0 );
425+ for (int64_t flat = 0 ; flat < numel; flat++) {
426+ // Compute source offset from strides
427+ int64_t src_offset = 0 ;
428+ for (int64_t d = 0 ; d < ndim; d++) {
429+ src_offset += coord[d] * strides[d];
430+ }
431+ std::memcpy (
432+ dst_bytes + flat * element_size,
433+ src_bytes + src_offset * element_size,
434+ element_size);
435+
436+ // Increment coordinate (last dim fastest)
437+ for (int64_t d = ndim - 1 ; d >= 0 ; d--) {
438+ if (++coord[d] < sizes[d])
439+ break ;
440+ coord[d] = 0 ;
441+ }
442+ }
443+ return dst;
444+ }
445+
370446AOTITorchError aoti_torch__reinterpret_tensor (
371447 AOTITensorHandle self,
372448 int64_t ndim,
@@ -430,8 +506,9 @@ AOTITorchError aoti_torch__reinterpret_tensor(
430506 data_ptr);
431507
432508 // Handle storage offset by adjusting the data pointer
433- void * adjusted_data = static_cast <char *>(data_ptr) +
434- (storage_offset * dtype_to_element_size (dtype));
509+ size_t element_size = dtype_to_element_size (dtype);
510+ void * adjusted_data =
511+ static_cast <char *>(data_ptr) + (storage_offset * element_size);
435512
436513 // Convert sizes using utility function from utils.h
437514 std::vector<aten::SizesType> sizes = convert_sizes_to_vector (ndim, sizes_ptr);
@@ -440,14 +517,35 @@ AOTITorchError aoti_torch__reinterpret_tensor(
440517 std::vector<aten::StridesType> strides =
441518 convert_strides_to_vector (ndim, sizes_ptr, strides_ptr);
442519
443- // Create new tensor view that reinterprets the same memory with different
444- // shape/strides This creates a view, not a copy - the data pointer is shared
520+ // If the view is not densely packed (e.g. chunk/split creating holes),
521+ // materialize it into a new contiguous buffer.
522+ void * tensor_data = adjusted_data;
523+ bool owns_buffer = false ;
524+ if (!is_packed_strides (sizes, strides)) {
525+ ET_LOG (
526+ Debug,
527+ " aoti_torch__reinterpret_tensor: non-packed strides, "
528+ " materializing to packed buffer" );
529+ tensor_data =
530+ materialize_packed (adjusted_data, sizes, strides, element_size);
531+ ET_CHECK_OR_RETURN_ERROR (
532+ tensor_data != nullptr ,
533+ MemoryAllocationFailed,
534+ " Failed to materialize non-packed tensor" );
535+ owns_buffer = true ;
536+
537+ // Compute contiguous strides for the packed buffer
538+ strides.resize (ndim);
539+ if (ndim > 0 ) {
540+ strides[ndim - 1 ] = 1 ;
541+ for (int64_t i = ndim - 2 ; i >= 0 ; i--) {
542+ strides[i] = strides[i + 1 ] * sizes[i + 1 ];
543+ }
544+ }
545+ }
546+
445547 std::shared_ptr<Tensor> tensor = executorch::extension::from_blob (
446- adjusted_data, // Use adjusted data pointer with storage offset applied
447- sizes, // New sizes with explicit SizesType
448- strides, // New strides with explicit StridesType
449- dtype_to_scalar_type (dtype) // Convert dtype with explicit type casting
450- );
548+ tensor_data, sizes, strides, dtype_to_scalar_type (dtype));
451549
452550 ET_CHECK_OR_RETURN_ERROR (
453551 tensor != nullptr ,
@@ -456,32 +554,36 @@ AOTITorchError aoti_torch__reinterpret_tensor(
456554
457555 // Store the tensor so it doesn't get destroyed
458556 tensors[tensor.get ()] = tensor;
459-
460557 *ret_new_tensor = tensor.get ();
461558
462- if (adjusted_data != data_ptr) {
463- ET_LOG (
464- Debug,
465- " aoti_torch__reinterpret_tensor: Adjusted original_data=%p, storage_offset=%lld, element_size=%zu, adjusted_data=%p" ,
466- data_ptr,
467- storage_offset,
468- dtype_to_element_size (dtype),
469- adjusted_data);
470-
471- ET_CHECK_OR_RETURN_ERROR (
472- metal_buffer_nocopy (adjusted_data, tensor->nbytes (), true ),
473- Internal,
474- " metal_buffer_nocopy failed for adjusted_data=%p, nbytes=%zu" ,
475- adjusted_data,
476- static_cast <size_t >(tensor->nbytes ()));
477-
478- memory_to_n_tensor[adjusted_data] = NOT_OWN;
479- }
559+ if (owns_buffer) {
560+ // The materialized buffer is a new allocation owned by this tensor
561+ memory_to_n_tensor[tensor_data] = 1 ;
562+ } else {
563+ if (adjusted_data != data_ptr) {
564+ ET_LOG (
565+ Debug,
566+ " aoti_torch__reinterpret_tensor: Adjusted original_data=%p, "
567+ " storage_offset=%lld, element_size=%zu, adjusted_data=%p" ,
568+ data_ptr,
569+ storage_offset,
570+ element_size,
571+ adjusted_data);
572+
573+ ET_CHECK_OR_RETURN_ERROR (
574+ metal_buffer_nocopy (adjusted_data, tensor->nbytes (), true ),
575+ Internal,
576+ " metal_buffer_nocopy failed for adjusted_data=%p, nbytes=%zu" ,
577+ adjusted_data,
578+ static_cast <size_t >(tensor->nbytes ()));
579+
580+ memory_to_n_tensor[adjusted_data] = NOT_OWN;
581+ }
480582
481- // Increment the reference count for this memory address only if it is owned
482- // by tensor
483- if ( memory_to_n_tensor[data_ptr] != NOT_OWN) {
484- memory_to_n_tensor[data_ptr] += 1 ;
583+ // Increment the reference count for this memory address only if it is owned
584+ if (memory_to_n_tensor[data_ptr] != NOT_OWN) {
585+ memory_to_n_tensor[data_ptr] += 1 ;
586+ }
485587 }
486588
487589 ET_LOG (Debug, " aoti_torch__reinterpret_tensor: successful" );
0 commit comments