@@ -7798,6 +7798,209 @@ void ggml_compute_forward_im2col_back_f32(
77987798 }
77997799}
78007800
7801+
7802+ // ggml_compute_forward_im2col_3d_f16
7803+ // src0: kernel [OC*IC, KD, KH, KW]
7804+ // src1: image [N*IC, ID, IH, IW]
7805+ // dst: result [N*OD, OH, OW, IC * KD * KH * KW]
7806+ static void ggml_compute_forward_im2col_3d_f16 (
7807+ const ggml_compute_params * params,
7808+ ggml_tensor * dst) {
7809+
7810+ const ggml_tensor * src0 = dst->src [0 ];
7811+ const ggml_tensor * src1 = dst->src [1 ];
7812+
7813+ GGML_ASSERT (src0->type == GGML_TYPE_F16);
7814+ GGML_ASSERT (src1->type == GGML_TYPE_F32);
7815+ GGML_ASSERT ( dst->type == GGML_TYPE_F16);
7816+
7817+ GGML_TENSOR_BINARY_OP_LOCALS;
7818+
7819+ const int32_t s0 = ((const int32_t *)(dst->op_params ))[0 ];
7820+ const int32_t s1 = ((const int32_t *)(dst->op_params ))[1 ];
7821+ const int32_t s2 = ((const int32_t *)(dst->op_params ))[2 ];
7822+ const int32_t p0 = ((const int32_t *)(dst->op_params ))[3 ];
7823+ const int32_t p1 = ((const int32_t *)(dst->op_params ))[4 ];
7824+ const int32_t p2 = ((const int32_t *)(dst->op_params ))[5 ];
7825+ const int32_t d0 = ((const int32_t *)(dst->op_params ))[6 ];
7826+ const int32_t d1 = ((const int32_t *)(dst->op_params ))[7 ];
7827+ const int32_t d2 = ((const int32_t *)(dst->op_params ))[8 ];
7828+ const int32_t IC = ((const int32_t *)(dst->op_params ))[9 ];
7829+
7830+
7831+ const int ith = params->ith ;
7832+ const int nth = params->nth ;
7833+
7834+ const int64_t N = ne13 / IC;
7835+ const int64_t ID = ne12;
7836+ const int64_t IH = ne11;
7837+ const int64_t IW = ne10;
7838+
7839+ const int64_t OC = ne03 / IC;
7840+ GGML_UNUSED (OC);
7841+ const int64_t KD = ne02;
7842+ const int64_t KH = ne01;
7843+ const int64_t KW = ne00;
7844+
7845+ const int64_t OD = ne3 / N;
7846+ const int64_t OH = ne2;
7847+ const int64_t OW = ne1;
7848+ const int64_t OH_OW = OH*OW;
7849+ const int64_t KD_KH_KW = KD*KH*KW;
7850+ const int64_t KH_KW = KH*KW;
7851+ const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
7852+
7853+ GGML_ASSERT (nb10 == sizeof (float ));
7854+
7855+ // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
7856+ {
7857+ ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data ;
7858+
7859+ for (int64_t in = 0 ; in < N; in++) {
7860+ for (int64_t iod = 0 ; iod < OD; iod++) {
7861+ for (int64_t ioh = 0 ; ioh < OH; ioh++) {
7862+ for (int64_t iow = 0 ; iow < OW; iow++) {
7863+ for (int64_t iic = ith; iic < IC; iic += nth) {
7864+
7865+ // micro kernel
7866+ ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
7867+ const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
7868+
7869+ for (int64_t ikd = 0 ; ikd < KD; ikd++) {
7870+ for (int64_t ikh = 0 ; ikh < KH; ikh++) {
7871+ for (int64_t ikw = 0 ; ikw < KW; ikw++) {
7872+ const int64_t iiw = iow*s0 + ikw*d0 - p0;
7873+ const int64_t iih = ioh*s1 + ikh*d1 - p1;
7874+ const int64_t iid = iod*s2 + ikd*d2 - p2;
7875+
7876+ if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
7877+ dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0 ;
7878+ } else {
7879+ const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
7880+ dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16 (*s);
7881+ }
7882+ }
7883+ }
7884+ }
7885+ }
7886+ }
7887+ }
7888+ }
7889+ }
7890+ }
7891+ }
7892+
7893+ // ggml_compute_forward_im2col_3d_f32
7894+ // src0: kernel [OC*IC, KD, KH, KW]
7895+ // src1: image [N*IC, ID, IH, IW]
7896+ // dst: result [N*OD, OH, OW, IC * KD * KH * KW]
7897+ static void ggml_compute_forward_im2col_3d_f32 (
7898+ const ggml_compute_params * params,
7899+ ggml_tensor * dst) {
7900+
7901+ const ggml_tensor * src0 = dst->src [0 ];
7902+ const ggml_tensor * src1 = dst->src [1 ];
7903+
7904+ GGML_ASSERT (src1->type == GGML_TYPE_F32);
7905+ GGML_ASSERT ( dst->type == GGML_TYPE_F32);
7906+
7907+ GGML_TENSOR_BINARY_OP_LOCALS;
7908+
7909+ const int32_t s0 = ((const int32_t *)(dst->op_params ))[0 ];
7910+ const int32_t s1 = ((const int32_t *)(dst->op_params ))[1 ];
7911+ const int32_t s2 = ((const int32_t *)(dst->op_params ))[2 ];
7912+ const int32_t p0 = ((const int32_t *)(dst->op_params ))[3 ];
7913+ const int32_t p1 = ((const int32_t *)(dst->op_params ))[4 ];
7914+ const int32_t p2 = ((const int32_t *)(dst->op_params ))[5 ];
7915+ const int32_t d0 = ((const int32_t *)(dst->op_params ))[6 ];
7916+ const int32_t d1 = ((const int32_t *)(dst->op_params ))[7 ];
7917+ const int32_t d2 = ((const int32_t *)(dst->op_params ))[8 ];
7918+ const int32_t IC = ((const int32_t *)(dst->op_params ))[9 ];
7919+
7920+
7921+ const int ith = params->ith ;
7922+ const int nth = params->nth ;
7923+
7924+ const int64_t N = ne13 / IC;
7925+ const int64_t ID = ne12;
7926+ const int64_t IH = ne11;
7927+ const int64_t IW = ne10;
7928+
7929+ const int64_t OC = ne03 / IC;
7930+ GGML_UNUSED (OC);
7931+ const int64_t KD = ne02;
7932+ const int64_t KH = ne01;
7933+ const int64_t KW = ne00;
7934+
7935+ const int64_t OD = ne3 / N;
7936+ const int64_t OH = ne2;
7937+ const int64_t OW = ne1;
7938+
7939+ const int64_t OH_OW = OH*OW;
7940+ const int64_t KD_KH_KW = KD*KH*KW;
7941+ const int64_t KH_KW = KH*KW;
7942+ const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
7943+
7944+ GGML_ASSERT (nb10 == sizeof (float ));
7945+
7946+ // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
7947+ {
7948+ float * const wdata = (float *) dst->data ;
7949+
7950+ for (int64_t in = 0 ; in < N; in++) {
7951+ for (int64_t iod = 0 ; iod < OD; iod++) {
7952+ for (int64_t ioh = 0 ; ioh < OH; ioh++) {
7953+ for (int64_t iow = 0 ; iow < OW; iow++) {
7954+ for (int64_t iic = ith; iic < IC; iic += nth) {
7955+
7956+ // micro kernel
7957+ float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
7958+ const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
7959+
7960+ for (int64_t ikd = 0 ; ikd < KD; ikd++) {
7961+ for (int64_t ikh = 0 ; ikh < KH; ikh++) {
7962+ for (int64_t ikw = 0 ; ikw < KW; ikw++) {
7963+ const int64_t iiw = iow*s0 + ikw*d0 - p0;
7964+ const int64_t iih = ioh*s1 + ikh*d1 - p1;
7965+ const int64_t iid = iod*s2 + ikd*d2 - p2;
7966+
7967+ if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
7968+ dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0 ;
7969+ } else {
7970+ const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
7971+ dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
7972+ }
7973+ }
7974+ }
7975+ }
7976+ }
7977+ }
7978+ }
7979+ }
7980+ }
7981+ }
7982+ }
7983+
7984+
7985+ void ggml_compute_forward_im2col_3d (
7986+ const ggml_compute_params * params,
7987+ ggml_tensor * dst) {
7988+ switch (dst->type ) {
7989+ case GGML_TYPE_F16:
7990+ {
7991+ ggml_compute_forward_im2col_3d_f16 (params, dst);
7992+ } break ;
7993+ case GGML_TYPE_F32:
7994+ {
7995+ ggml_compute_forward_im2col_3d_f32 (params, dst);
7996+ } break ;
7997+ default :
7998+ {
7999+ GGML_ABORT (" fatal error" );
8000+ }
8001+ }
8002+ }
8003+
78018004static void ggml_call_mul_mat (ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
78028005 void * a, void * b, float * c) {
78038006 const ggml_type_traits * traits = ggml_get_type_traits (type);
@@ -8785,6 +8988,15 @@ static void ggml_compute_forward_pad_f32(
87858988 GGML_TENSOR_UNARY_OP_LOCALS
87868989
87878990 float * dst_ptr = (float *) dst->data ;
8991+ const int32_t lp0 = ggml_get_op_params_i32 (dst, 0 );
8992+ const int32_t rp0 = ggml_get_op_params_i32 (dst, 1 );
8993+ const int32_t lp1 = ggml_get_op_params_i32 (dst, 2 );
8994+ const int32_t rp1 = ggml_get_op_params_i32 (dst, 3 );
8995+ const int32_t lp2 = ggml_get_op_params_i32 (dst, 4 );
8996+ const int32_t rp2 = ggml_get_op_params_i32 (dst, 5 );
8997+ const int32_t lp3 = ggml_get_op_params_i32 (dst, 6 );
8998+ const int32_t rp3 = ggml_get_op_params_i32 (dst, 7 );
8999+
87889000
87899001 // TODO: optimize
87909002
@@ -8793,10 +9005,12 @@ static void ggml_compute_forward_pad_f32(
87939005 for (int64_t i0 = 0 ; i0 < ne0; ++i0) {
87949006 for (int64_t i3 = 0 ; i3 < ne3; ++i3) {
87959007 const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
8796-
8797- const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
8798-
8799- if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
9008+ if ((i0 >= lp0 && i0 < ne0 - rp0) \
9009+ && (i1 >= lp1 && i1 < ne1 - rp1) \
9010+ && (i2 >= lp2 && i2 < ne2 - rp2) \
9011+ && (i3 >= lp3 && i3 < ne3 - rp3)) {
9012+ const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
9013+ const float * src_ptr = (const float *)((char *) src0->data + src_idx);
88009014 dst_ptr[dst_idx] = *src_ptr;
88019015 } else {
88029016 dst_ptr[dst_idx] = 0 ;
0 commit comments