@@ -112,132 +112,3 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
112112 im2col_cuda_f32 (src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
113113 }
114114}
115-
// [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
//
// Expected launch layout:
//   x : one thread per element of an output row, i.e. per (iic, ikd, ikh, ikw); threads with
//       i >= IC_KD_KH_KW exit immediately
//   y : output column iow, walked with a stride of gridDim.y so the grid may be smaller than OW
//   z : flattened (in, iod, ioh), walked with a stride of MAX_GRIDDIM_Z
//
// src and dst are indexed as dense arrays using the precomputed products passed in (no byte strides).
// Kernel taps that fall outside the input volume are written as zero.
template <typename T>
static __global__ void im2col_3d_kernel(
        const float * src, T * dst,
        int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
        int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
        int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW,
        int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW,
        int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH,
        int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) {
    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= IC_KD_KH_KW) {
        return;
    }

    // Split the row index into input channel and kernel coordinates:
    // i = iic*KD_KH_KW + ikd*KH_KW + ikh*KW + ikw
    const int64_t iic = i / KD_KH_KW;
    const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW;
    const int64_t ikh = (i - iic * KD_KH_KW - ikd * KH_KW) / KW;
    const int64_t ikw = i % KW;

    // Looping over iow (instead of taking blockIdx.y directly) lets the host clamp gridDim.y
    // to the hardware limit; when gridDim.y == OW this runs exactly once per block.
    for (int64_t iow = blockIdx.y; iow < OW; iow += gridDim.y) {
        const int64_t iiw = iow * s0 + ikw * d0 - p0;

        for (int64_t iz = blockIdx.z; iz < N_OD_OH; iz += MAX_GRIDDIM_Z) {
            // iz = in*OD_OH + iod*OH + ioh
            const int64_t in  = iz / OD_OH;
            const int64_t iod = (iz - in*OD_OH) / OH;
            const int64_t ioh = iz % OH;

            const int64_t iih = ioh * s1 + ikh * d1 - p1;
            const int64_t iid = iod * s2 + ikd * d2 - p2;

            const int64_t offset_dst = in*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW +
                                       iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd*KH_KW + ikh*KW + ikw;

            if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
                // padding region
                dst[offset_dst] = 0.0f;
            } else {
                const int64_t offset_src = in*IC_ID_IH_IW + iic*ID_IH_IW + iid*IH_IW + iih*IW + iiw;
                dst[offset_dst] = src[offset_src];
            }
        }
    }
}

// [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
//
// Host-side launcher: precomputes the dimension products used by im2col_3d_kernel and enqueues it on
// `stream`. The launch is asynchronous and no error check is performed here.
// Grid: x covers IC*KD*KH*KW in blocks of CUDA_IM2COL_BLOCK_SIZE threads, y covers OW, z covers N*OD*OH.
// Both y and z are clamped to the hardware grid limits; the kernel strides over the remainder.
template <typename T>
static void im2col_3d_cuda(const float * src, T * dst,
        int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
        int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
        int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
    // CUDA limits gridDim.y to 65535; a larger OW used to make the launch fail outright.
    const int64_t max_griddim_y = 65535;

    const int64_t OH_OW                = OH*OW;
    const int64_t KD_KH_KW             = KD*KH*KW;
    const int64_t ID_IH_IW             = ID*IH*IW;
    const int64_t KH_KW                = KH*KW;
    const int64_t IH_IW                = IH*IW;
    const int64_t IC_KD_KH_KW          = IC*KD*KH*KW;
    const int64_t OW_KD_KH_KW          = OW*KD*KH*KW;
    const int64_t N_OD_OH              = N*OD*OH;
    const int64_t OD_OH                = OD*OH;
    const int64_t IC_ID_IH_IW          = IC*ID*IH*IW;
    const int64_t OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW;
    const int64_t OH_OW_IC_KD_KH_KW    = OH*OW*IC*KD*KH*KW;
    const int64_t OW_IC_KD_KH_KW       = OW*IC*KD*KH*KW;

    const int64_t num_blocks = (IC_KD_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
    dim3 block_nums(num_blocks, MIN(OW, max_griddim_y), MIN(N_OD_OH, MAX_GRIDDIM_Z));

    im2col_3d_kernel<<<block_nums, MIN(IC_KD_KH_KW, CUDA_IM2COL_BLOCK_SIZE), 0, stream>>>(
        src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
        OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW,
        IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW,
        OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH,
        s0, s1, s2, p0, p1, p2, d0, d1, d2);
}
184-
// F16-output entry point: forwards every argument unchanged to im2col_3d_cuda, which is
// instantiated for half through the type of `dst`.
static void im2col_3d_cuda_f16(const float * src, half * dst,
        int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
        int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
        int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
    im2col_3d_cuda(
        src, dst,
        N, IC, ID, IH, IW, OC,
        KD, KH, KW, OD, OH, OW,
        s0, s1, s2, p0, p1, p2, d0, d1, d2,
        stream);
}
192-
// F32-output entry point: forwards every argument unchanged to im2col_3d_cuda, which is
// instantiated for float through the type of `dst`.
static void im2col_3d_cuda_f32(const float * src, float * dst,
        int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
        int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
        int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
    im2col_3d_cuda(
        src, dst,
        N, IC, ID, IH, IW, OC,
        KD, KH, KW, OD, OH, OW,
        s0, s1, s2, p0, p1, p2, d0, d1, d2,
        stream);
}
200-
// CUDA backend entry point for the 3D im2col op.
//
// dst->src[0] supplies the kernel extents (KW, KH, KD and OC*IC from ne00..ne03), dst->src[1] is the
// F32 input whose data is read, and dst (F16 or F32) receives the result. Strides, paddings and
// dilations for the three spatial axes plus the input channel count IC are read from dst->op_params.
// The work is enqueued on the context's stream.
void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const float * src1_d = (const float *) src1->data;
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    GGML_TENSOR_BINARY_OP_LOCALS

    // op_params layout: [s0, s1, s2, p0, p1, p2, d0, d1, d2, IC]
    const int32_t * op_params = (const int32_t *) dst->op_params;

    const int32_t s0 = op_params[0];
    const int32_t s1 = op_params[1];
    const int32_t s2 = op_params[2];
    const int32_t p0 = op_params[3];
    const int32_t p1 = op_params[4];
    const int32_t p2 = op_params[5];
    const int32_t d0 = op_params[6];
    const int32_t d1 = op_params[7];
    const int32_t d2 = op_params[8];
    const int32_t IC = op_params[9];

    // IC is used as a divisor below; fail with a clear message instead of a division by zero.
    GGML_ASSERT(IC > 0);

    // The batch and channel axes are fused in dim 3 of src1.
    const int64_t N  = ne13 / IC;
    const int64_t ID = ne12;
    const int64_t IH = ne11;
    const int64_t IW = ne10;

    // N divides ne3 below, so it must be non-zero as well (it is zero when ne13 < IC).
    GGML_ASSERT(N > 0);

    const int64_t OC = ne03 / IC;
    const int64_t KD = ne02;
    const int64_t KH = ne01;
    const int64_t KW = ne00;

    // The batch and output-depth axes are fused in dim 3 of dst.
    const int64_t OD = ne3 / N;
    const int64_t OH = ne2;
    const int64_t OW = ne1;

    if (dst->type == GGML_TYPE_F16) {
        im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
    } else {
        im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW, s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
    }
}
0 commit comments