File tree Expand file tree Collapse file tree
ggml/src/ggml-hexagon/htp Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -88,6 +88,29 @@ static void cpy_thread_sametype_reshape(struct htp_copy_context * ct, struct htp
8888 const uint32_t ir0 = dr * ith ;
8989 const uint32_t ir1 = (ir0 + dr ) < nr ? (ir0 + dr ) : nr ;
9090
91+ // Fast path: when both src0 and dst are contiguous in memory,
92+ // replace the element-by-element loop with a single bulk HVX copy of rows [ir0, ir1) per (i03, i02) slice.
93+ const bool src0_contig = (nb00 == ct -> src0_type_size ) &&
94+ (nb01 == ne00 * nb00 ) &&
95+ (nb02 == ne01 * nb01 ) &&
96+ (nb03 == ne02 * nb02 );
97+ const bool dst_contig = (nb0 == ct -> dst_type_size ) &&
98+ (nb1 == ne0 * nb0 ) &&
99+ (nb2 == ne1 * nb1 ) &&
100+ (nb3 == ne2 * nb2 );
101+
102+ if (src0_contig && dst_contig ) {
103+ for (int64_t i03 = 0 ; i03 < ne03 ; i03 ++ ) {
104+ for (int64_t i02 = 0 ; i02 < ne02 ; i02 ++ ) {
105+ uint8_t * src_ptr = (uint8_t * ) src0 -> data + i03 * nb03 + i02 * nb02 + ir0 * nb01 ;
106+ uint32_t flat = ((i03 * ne02 + i02 )* ne01 + ir0 ) * ne00 ;
107+ uint8_t * dst_ptr = (uint8_t * ) dst -> data + flat * ct -> src0_type_size ;
108+ hvx_copy_uu (dst_ptr , src_ptr , (ir1 - ir0 ) * ne00 , ct -> src0_type_size );
109+ }
110+ }
111+ return ;
112+ }
113+
91114 // dst counters
92115 int64_t k10 = 0 ;
93116 int64_t i11 = 0 ;
You can’t perform that action at this time.
0 commit comments