Skip to content

Commit 08f2145

Browse files
shaofeiqilhez
and authored
opencl: add q4_K gemm and gemv kernels for Adreno (ggml-org#20919)
* opencl: add q4_K gemm and gemv kernels for Adreno * opencl: fix whitespace * opencl: add workarounds for compiler bugs on older devices * opencl: handle fp16 denorm on X Elite * opencl: fix kernel build error * opencl: fix whitespace * opencl: make q4_K cvt kernels signature consistent --------- Co-authored-by: Li He <lih@qti.qualcomm.com>
1 parent 84ae843 commit 08f2145

5 files changed

Lines changed: 877 additions & 2 deletions

File tree

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ set(GGML_OPENCL_KERNELS
114114
gemv_noshuffle_q4_1_f32
115115
gemm_noshuffle_q4_1_f32
116116
gemv_noshuffle_general_q8_0_f32
117+
gemv_noshuffle_q4_k_f32
118+
gemm_noshuffle_q4_k_f32
117119
gemv_noshuffle_q6_k_f32
118120
gemm_noshuffle_q6_k_f32
119121
mul

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 312 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,8 @@ struct ggml_backend_opencl_context {
538538
cl_kernel kernel_restore_block_q4_0_noshuffle;
539539
cl_kernel kernel_convert_block_q4_1_noshuffle;
540540
cl_kernel kernel_restore_block_q4_1_noshuffle;
541+
cl_kernel kernel_convert_block_q4_K_noshuffle;
542+
cl_kernel kernel_restore_block_q4_K_noshuffle;
541543
cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K;
542544
cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
543545
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
@@ -720,6 +722,8 @@ struct ggml_backend_opencl_context {
720722
cl_kernel kernel_gemm_noshuffle_q4_1_f32;
721723
cl_kernel kernel_mul_mm_q8_0_f32_8x4;
722724
cl_kernel CL_mul_mat_vec_q8_0_f32;
725+
cl_kernel kernel_gemv_noshuffle_q4_k_f32;
726+
cl_kernel kernel_gemm_noshuffle_q4_k_f32;
723727
cl_kernel kernel_gemv_noshuffle_q6_K_f32;
724728
cl_kernel kernel_gemm_noshuffle_q6_K_f32;
725729
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
@@ -932,6 +936,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
932936
CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
933937
CL_CHECK((backend_ctx->kernel_convert_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K", &err), err));
934938
CL_CHECK((backend_ctx->kernel_restore_block_q4_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K", &err), err));
939+
CL_CHECK((backend_ctx->kernel_convert_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_K_noshuffle", &err), err));
940+
CL_CHECK((backend_ctx->kernel_restore_block_q4_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_K_noshuffle", &err), err));
935941
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
936942
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
937943
CL_CHECK((backend_ctx->kernel_convert_block_q6_K_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K_noshuffle", &err), err));
@@ -2619,6 +2625,45 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
26192625
GGML_LOG_CONT(".");
26202626
}
26212627

2628+
// gemm_noshuffle_q4_k_f32
2629+
{
2630+
#ifdef GGML_OPENCL_EMBED_KERNELS
2631+
const std::string kernel_src {
2632+
#include "gemm_noshuffle_q4_k_f32.cl.h"
2633+
};
2634+
#else
2635+
const std::string kernel_src = read_file("gemm_noshuffle_q4_k_f32.cl");
2636+
#endif
2637+
cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2638+
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q4_k_f32", &err), err));
2639+
CL_CHECK(clReleaseProgram(prog));
2640+
GGML_LOG_CONT(".");
2641+
}
2642+
2643+
// gemv_noshuffle_q4_k_f32
2644+
{
2645+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2646+
" -cl-mad-enable ";
2647+
if (backend_ctx->has_vector_subgroup_broadcast) {
2648+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST ";
2649+
}
2650+
2651+
#ifdef GGML_OPENCL_EMBED_KERNELS
2652+
const std::string kernel_src {
2653+
#include "gemv_noshuffle_q4_k_f32.cl.h"
2654+
};
2655+
#else
2656+
const std::string kernel_src = read_file("gemv_noshuffle_q4_k_f32.cl");
2657+
#endif
2658+
2659+
cl_program prog = build_program_from_source(
2660+
backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts);
2661+
2662+
CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q4_k_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q4_k_f32", &err), err));
2663+
CL_CHECK(clReleaseProgram(prog));
2664+
GGML_LOG_CONT(".");
2665+
}
2666+
26222667
std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
26232668
" -cl-mad-enable "
26242669
" -cl-fast-relaxed-math";
@@ -5060,12 +5105,25 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
50605105
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
50615106
CL_CHECK(err);
50625107

5108+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
50635109
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
5110+
if (use_adreno_kernels(backend_ctx, tensor)) {
5111+
kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
5112+
}
5113+
#else
5114+
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
5115+
#endif
5116+
5117+
cl_uchar mask_0F = 0x0F;
5118+
cl_uchar mask_F0 = 0xF0;
5119+
50645120
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
50655121
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
50665122
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
50675123
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
50685124
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->dm));
5125+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
5126+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
50695127

50705128
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
50715129
size_t local_work_size[] = {64, 1, 1};
@@ -5076,6 +5134,20 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
50765134
CL_CHECK(clReleaseMemObject(data_device));
50775135

50785136
tensor->extra = extra;
5137+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
5138+
if (use_adreno_kernels(backend_ctx, tensor)) {
5139+
5140+
int M = tensor->ne[1];
5141+
int K = tensor->ne[0];
5142+
5143+
GGML_ASSERT(K % 32 == 0);
5144+
5145+
// Transpose q, d, dm as ushort
5146+
transpose_2d_as_16b(backend_ctx, extra->q, extra->q, size_q, K/4, M);
5147+
transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/256, M);
5148+
transpose_2d_as_16b(backend_ctx, extra->dm, extra->dm, size_dm, K/256, M);
5149+
}
5150+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
50795151
return;
50805152
}
50815153
if (tensor->type == GGML_TYPE_Q6_K) {
@@ -5516,12 +5588,60 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
55165588
ggml_nbytes(tensor), NULL, &err);
55175589
CL_CHECK(err);
55185590

5591+
cl_uchar mask_0F = 0x0F;
5592+
cl_uchar mask_F0 = 0xF0;
5593+
5594+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
5595+
if (use_adreno_kernels(backend_ctx, tensor)) {
5596+
int M = tensor->ne[1];
5597+
int K = tensor->ne[0];
5598+
5599+
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
5600+
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
5601+
size_t size_dm = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
5602+
5603+
static ggml_cl_buffer buf_trans_q;
5604+
static ggml_cl_buffer buf_trans_d;
5605+
static ggml_cl_buffer buf_trans_dm;
5606+
5607+
buf_trans_q.allocate(backend_ctx->context, size_q);
5608+
buf_trans_d.allocate(backend_ctx->context, size_d);
5609+
buf_trans_dm.allocate(backend_ctx->context, size_dm);
5610+
5611+
// Transpose q, d, dm back
5612+
transpose_2d_as_16b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/4);
5613+
transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/256);
5614+
transpose_2d_as_16b(backend_ctx, extra->dm, buf_trans_dm.buffer, size_dm, M, K/256);
5615+
5616+
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K_noshuffle;
5617+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
5618+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
5619+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer));
5620+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_dm.buffer));
5621+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
5622+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
5623+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
5624+
5625+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
5626+
size_t local_work_size[] = {1, 1, 1};
5627+
5628+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
5629+
global_work_size, local_work_size, 0, NULL, NULL));
5630+
CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset,
5631+
size, data, 0, NULL, NULL));
5632+
CL_CHECK(clReleaseMemObject(data_device));
5633+
return;
5634+
}
5635+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
5636+
55195637
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_K;
55205638
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
55215639
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->s));
55225640
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
55235641
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm));
55245642
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
5643+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F));
5644+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0));
55255645

55265646
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
55275647
size_t local_work_size[] = {1, 1, 1};
@@ -9688,6 +9808,192 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
96889808
#endif
96899809
}
96909810

9811+
// Adreno-specific matrix multiplication for q4_K weights (src0) with fp32
// activations (src1). Dispatches to a GEMV kernel when dst has a single
// column (ne1 == 1) and to a tiled GEMM kernel otherwise. Compiled to a
// no-op stub when GGML_OPENCL_USE_ADRENO_KERNELS is not defined.
static void ggml_cl_mul_mat_q4_k_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
    // q4_K weights are stored flattened into separate q/s/d/dm device buffers.
    ggml_tensor_extra_cl_q4_K * extra0_q4_k = (ggml_tensor_extra_cl_q4_K *)src0->extra;

    // Byte offsets of the activation and destination data within their buffers.
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];

    const int ne1 = dst->ne[1];

    GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);

    cl_context context = backend_ctx->context;
    cl_kernel kernel;

    cl_int err;
    cl_image_format img_fmt;
    cl_image_desc img_desc;
    cl_buffer_region region;

    // Conventional GEMM naming: dst (M x N) = src0 (M x K) * src1 (K x N).
    int M = ne01;
    int N = ne1;
    int K = ne00;

    // Bit masks passed to the kernels for unpacking q4_K scale/min bytes.
    // NOTE(review): passing them as kernel args (rather than literals in the
    // kernel) appears to work around compiler bugs on older devices — see the
    // commit message; confirm before simplifying.
    cl_uchar mask_d6 = 0x3F;
    cl_uchar mask_d4 = 0x0F;
    cl_uchar mask_hi2 = 0xC0;

    if (ne1 == 1) {
        // --- GEMV path: single output column ---
        cl_mem q_img = nullptr;
        cl_mem b_sub_buf = nullptr;
        cl_mem b_img = nullptr;

        // image for q
        // 1D image view over the packed 4-bit quants: M*K/2 bytes read as
        // 32-bit texels, hence /2 (two quants per byte) and /4 (bytes per texel).
        img_fmt = { CL_R, CL_UNSIGNED_INT32};
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = M * K / 2 / 4;
        img_desc.buffer = extra0_q4_k->q;
        CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        // subbuffer for activations
        region.origin = offset1;
        region.size = K * N * sizeof(float);
        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));

        // image for activations
        // fp32 activations viewed as RGBA float texels (4 floats per texel).
        img_fmt = {CL_RGBA, CL_FLOAT};
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = K * N / 4;
        img_desc.buffer = b_sub_buf;
        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        kernel = backend_ctx->kernel_gemv_noshuffle_q4_k_f32;

        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &q_img));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_k->d));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra0_q4_k->dm));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extra0_q4_k->s));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &b_img));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_int),   &ne00));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_int),   &ne01));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_uchar), &mask_d6));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uchar), &mask_d4));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_hi2));

        // Each workgroup is 64x4; global x covers ne01/2 rows, rounded up to
        // a multiple of the workgroup width.
        size_t local_work_size[3]  = {64, 4, 1};
        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

        CL_CHECK(clReleaseMemObject(q_img));
        CL_CHECK(clReleaseMemObject(b_sub_buf));
        CL_CHECK(clReleaseMemObject(b_img));
    } else {
        // --- GEMM path: multiple output columns ---

        cl_mem b_sub_buf       = nullptr;
        cl_mem b_sub_buf_trans = nullptr;
        cl_mem b_img           = nullptr;
        cl_mem b_img_trans     = nullptr;

        // subbuffer for activations
        region.origin = offset1;
        region.size = K * N * sizeof(float);
        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));

        // image for activations
        img_fmt = {CL_RGBA, CL_FLOAT};
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = K * N / 4;
        img_desc.buffer = b_sub_buf;
        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));

        // pad N to multiple of 8
        int extra_elements = N % 8;
        int padding = 0;
        if (extra_elements > 0){
            padding = 8 - extra_elements;
        }

        // subbuffer for transposed activations
        // Transposed copy is stored as fp16, hence sizeof(float)/2.
        region.origin = 0;
        region.size = K * (N + padding) * sizeof(float)/2;
        backend_ctx->prealloc_act_trans.allocate(context, region.size);
        CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));

        // image for transposed activations
        img_fmt = {CL_RGBA, CL_HALF_FLOAT};
        memset(&img_desc, 0, sizeof(img_desc));
        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc.image_width = K * (N + padding) / 4;
        img_desc.buffer = b_sub_buf_trans;
        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));

        // transpose activations
        // Dimensions in texels (4 elements per texel); clamp height to at
        // least 1 so the kernel launch is valid for very small N.
        int height_B = N/4;
        if (height_B == 0) {
            height_B = 1;
        }
        int width_B = K/4;
        int padded_height_B = (N + padding)/4;

        kernel = backend_ctx->kernel_transpose_32_16;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));

        size_t local_work_size_t[2]  = { 1, 16 };
        size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);

        // gemm
        kernel = backend_ctx->kernel_gemm_noshuffle_q4_k_f32;
        int padded_N = N + padding;

        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_k->q));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_k->s));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra0_q4_k->d));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extra0_q4_k->dm));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &b_img_trans));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_int),   &ne01));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_int),   &padded_N));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_int),   &ne00));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_int),   &ne1));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_uchar), &mask_d6));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_uchar), &mask_d4));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_uchar), &mask_hi2));

        // One work-item column per 8 output columns, 4 output rows per y step.
        size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1};
        size_t local_work_size[3]  = {1, 128, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
        CL_CHECK(clReleaseMemObject(b_sub_buf));
        CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
        CL_CHECK(clReleaseMemObject(b_img));
        CL_CHECK(clReleaseMemObject(b_img_trans));
    }
#else
    GGML_UNUSED(backend);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
#endif
}
9996+
96919997
static void ggml_cl_mul_mat_q6_K_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
96929998
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
96939999
GGML_ASSERT(src0);
@@ -10014,6 +10320,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1001410320
return;
1001510321
}
1001610322

10323+
// q4_k x fp32
10324+
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
10325+
ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
10326+
return;
10327+
}
10328+
1001710329
// q6_K x fp32
1001810330
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
1001910331
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);

0 commit comments

Comments
 (0)