@@ -538,6 +538,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_restore_block_q4_0_noshuffle;
     cl_kernel kernel_convert_block_q4_1_noshuffle;
     cl_kernel kernel_restore_block_q4_1_noshuffle;
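+    // q4_K convert/restore kernels for the flat (noshuffle) weight layout used by the Adreno path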
+    cl_kernel kernel_convert_block_q4_K_noshuffle;
+    cl_kernel kernel_restore_block_q4_K_noshuffle;
     cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
     cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
     cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
@@ -720,6 +722,8 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_gemm_noshuffle_q4_1_f32;
     cl_kernel kernel_mul_mm_q8_0_f32_8x4;
     cl_kernel kernel_mul_mat_vec_q8_0_f32;
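+    // q4_K gemv/gemm kernels that consume the flat (noshuffle) weight layout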
+    cl_kernel kernel_gemv_noshuffle_q4_k_f32;
+    cl_kernel kernel_gemm_noshuffle_q4_k_f32;
     cl_kernel kernel_gemv_noshuffle_q6_K_f32;
     cl_kernel kernel_gemm_noshuffle_q6_K_f32;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
@@ -932,6 +936,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err));
+        CL_CHECK((backend_ctx->kernel_convert_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K_noshuffle", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
@@ -2619,6 +2625,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // gemm_noshuffle_q4_k_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_noshuffle_q4_k_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl");
+#endif
+        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_k_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_q4_k_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+            " -cl-mad-enable ";
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
+        }
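+        // the define enables the vectorized sub-group broadcast path in the kernel on devices that advertise support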
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemv_noshuffle_q4_k_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemv_noshuffle_q4_k_f32.cl");
+#endif
+
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_k_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
         " -cl-mad-enable "
         " -cl-fast-relaxed-math";
@@ -5060,12 +5105,25 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
         CL_CHECK(err);

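+        // on Adreno, de-quantize directly into the flat layout consumed by the q4_K noshuffle gemv/gemm kernels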
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
         cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
+        }
+#else
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
+#endif
+
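+        // low/high nibble masks for unpacking the packed 4-bit quants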
+        cl_uchar mask_0F = 0x0F;
+        cl_uchar mask_F0 = 0xF0;
+
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));

         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {64, 1, 1};
@@ -5076,6 +5134,20 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         CL_CHECK(clReleaseMemObject(data_device));

         tensor->extra = extra;
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            int M = tensor->ne[1];
+            int K = tensor->ne[0];
+
+            GGML_ASSERT(K % 256 == 0);
+
+            // Transpose q, d, dm as ushort
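+            // q holds K/2 bytes (K/4 ushorts) per row; d and dm each hold one fp16 per 256-element block (K/256 per row)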
+            transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
+            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/256, M);
+            transpose_2d_as_16b(backend_ctx, extra->dm, extra->dm, size_dm, K/256, M);
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
         return;
     }
     if (tensor->type == GGML_TYPE_Q6_K) {
@@ -5516,12 +5588,60 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);

+        cl_uchar mask_0F = 0x0F;
+        cl_uchar mask_F0 = 0xF0;
+
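+        // Adreno path: undo the 16-bit transposes, then restore standard q4_K blocks with the noshuffle kernel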
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (use_adreno_kernels(backend_ctx, tensor)) {
+            int M = tensor->ne[1];
+            int K = tensor->ne[0];
+
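+            // total bytes: q packs 4 bits per element (nelements/2); d and dm each store one fp16 per 256-element block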
+            size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
+            size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+            size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+
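+            // static scratch buffers, reused across get_tensor calls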
+            static ggml_cl_buffer buf_trans_q;
+            static ggml_cl_buffer buf_trans_d;
+            static ggml_cl_buffer buf_trans_dm;
+
+            buf_trans_q.allocate(backend_ctx->context, size_q);
+            buf_trans_d.allocate(backend_ctx->context, size_d);
+            buf_trans_dm.allocate(backend_ctx->context, size_dm);
+
+            // Transpose q, d, dm back
+            transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
+            transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256);
+            transpose_2d_as_16b(backend_ctx, extra->dm, buf_trans_dm.buffer, size_dm, M, K/256);
+
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K_noshuffle;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_dm.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
+
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {1, 1, 1};
+
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, NULL));
+            CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+            CL_CHECK(clReleaseMemObject(data_device));
+            return;
+        }
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
         cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K;
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));

         size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
         size_t local_work_size[] = {1, 1, 1};
@@ -9688,6 +9808,192 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
 #endif
 }

+static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    ggml_tensor_extra_cl_q4_K * extra0_q4_k = (ggml_tensor_extra_cl_q4_K *)src0->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+
+    const int ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+    cl_context context = backend_ctx->context;
+    cl_kernel kernel;
+
+    cl_int err;
+    cl_image_format img_fmt;
+    cl_image_desc img_desc;
+    cl_buffer_region region;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
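+    // masks for unpacking q4_K's packed 6-bit scales/mins: low 6 bits, low 4 bits, high 2 bits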
+    cl_uchar mask_d6 = 0x3F;
+    cl_uchar mask_d4 = 0x0F;
+    cl_uchar mask_hi2 = 0xC0;
+
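+    // a single output column takes the gemv path; otherwise the activations are transposed to fp16 and tiled through the gemm kernel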
+    if (ne1 == 1) {
+        cl_mem q_img = nullptr;
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_img = nullptr;
+
+        // image for q
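+        // q is M*K/2 bytes; each CL_UNSIGNED_INT32 pixel covers 4 bytes (8 packed 4-bit values)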
+        img_fmt = {CL_R, CL_UNSIGNED_INT32};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = M * K / 2 / 4;
+        img_desc.buffer = extra0_q4_k->q;
+        CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        kernel = backend_ctx->kernel_gemv_noshuffle_q4_k_f32;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_k->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_k->dm));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_k->s));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_d6));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uchar), &mask_d4));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_hi2));
+
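+        // two output rows per work-item along dim 0; the grid is rounded up to the 64-wide work-group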
+        size_t local_work_size[3] = {64, 4, 1};
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(q_img));
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_img));
+    } else {
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_sub_buf_trans = nullptr;
+        cl_mem b_img = nullptr;
+        cl_mem b_img_trans = nullptr;
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4;
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // pad N to multiple of 8
+        int extra_elements = N % 8;
+        int padding = 0;
+        if (extra_elements > 0) {
+            padding = 8 - extra_elements;
+        }
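+        // the gemm kernel produces 8 output columns per work-item, so activations are padded to a multiple of 8 columns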
+
+        // subbuffer for transposed activations
+        region.origin = 0;
+        region.size = K * (N + padding) * sizeof(float)/2;
+        backend_ctx->prealloc_act_trans.allocate(context, region.size);
+        CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for transposed activations
+        img_fmt = {CL_RGBA, CL_HALF_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * (N + padding) / 4;
+        img_desc.buffer = b_sub_buf_trans;
+        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
+
+        // transpose activations
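+        // kernel_transpose_32_16 transposes the fp32 activation image into the padded fp16 image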
+        int height_B = N/4;
+        if (height_B == 0) {
+            height_B = 1;
+        }
+        int width_B = K/4;
+        int padded_height_B = (N + padding)/4;
+
+        kernel = backend_ctx->kernel_transpose_32_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
+
+        size_t local_work_size_t[2] = {1, 16};
+        size_t global_work_size_t[2] = {(size_t)width_B, (size_t)padded_height_B};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
+
+        // gemm
+        kernel = backend_ctx->kernel_gemm_noshuffle_q4_k_f32;
+        int padded_N = N + padding;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_k->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_k->s));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q4_k->d));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q4_k->dm));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &padded_N));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_int), &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_d6));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_d4));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_uchar), &mask_hi2));
+
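+        // one work-item per 8-column x 4-row tile of the output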
+        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
+        size_t local_work_size[3] = {1, 128, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+        CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
+        CL_CHECK(clReleaseMemObject(b_img));
+        CL_CHECK(clReleaseMemObject(b_img_trans));
+    }
+#else
+    GGML_UNUSED(backend);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+#endif
+}
+
 static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     GGML_ASSERT(src0);
@@ -10014,6 +10320,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         return;
     }

+    // q4_K x fp32
+    if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
+        ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
+        return;
+    }
+
     // q6_K x fp32
     if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
         ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);