@@ -4950,6 +4950,21 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
49504950 return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
49514951}
49524952
4953+ static inline bool use_flat_gemv_for_large_m_q4_K(const ggml_tensor *tensor) {
4954+ // gemv_noshuffle variant perf drops for large M, use flat variant for large M.
4955+ // threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
4956+ // note that this forces large M weights to use LM GEMM.
4957+ return tensor->ne[1] >= 32768 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
4958+ }
4959+
4960+ static inline bool use_flat_gemv_for_large_m_q6_K(const ggml_tensor *tensor) {
4961+ // gemv_noshuffle variant perf drops for large M, use flat variant for large M.
4962+ // threshold is well above typical hidden/FFN dims, but below typical vocab sizes.
4963+ // q6_K flat gemv is worse for smaller K; 2048 seems to be a reasonable threshold.
4964+ // note that this forces large M weights to use LM GEMM.
4965+ return tensor->ne[1] >= 32768 && tensor->ne[0] >= 2048 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
4966+ }
4967+
49534968static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
49544969 ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
49554970 ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
@@ -6595,7 +6610,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
65956610
65966611#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
65976612 cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
6598- if (use_adreno_kernels(backend_ctx, tensor)) {
6613+ if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor) ) {
65996614 kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
66006615 }
66016616#else
@@ -6623,7 +6638,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
66236638
66246639 tensor->extra = extra;
66256640#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
6626- if (use_adreno_kernels(backend_ctx, tensor)) {
6641+ if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor) ) {
66276642
66286643 int M = tensor->ne[1];
66296644 int K = tensor->ne[0];
@@ -6923,7 +6938,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
69236938 cl_kernel kernel;
69246939#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
69256940 kernel = backend_ctx->kernel_convert_block_q6_K;
6926- if (use_adreno_kernels(backend_ctx, tensor)) {
6941+ if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor) ) {
69276942 kernel = backend_ctx->kernel_convert_block_q6_K_noshuffle;
69286943 }
69296944#else
@@ -6956,7 +6971,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
69566971 tensor->extra = extra;
69576972
69586973#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
6959- if (use_adreno_kernels(backend_ctx, tensor)) {
6974+ if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor) ) {
69606975 cl_int M = tensor->ne[1]; // ne01
69616976 cl_int K = tensor->ne[0]; // ne00
69626977
@@ -7599,7 +7614,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
75997614 CL_CHECK(clReleaseMemObject(data_device));
76007615 return;
76017616 }
7602- if (use_adreno_kernels(backend_ctx, tensor)) {
7617+ if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor) ) {
76037618 int M = tensor->ne[1];
76047619 int K = tensor->ne[0];
76057620
@@ -7820,7 +7835,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
78207835 CL_CHECK(clReleaseMemObject(data_device));
78217836 return;
78227837 }
7823- if (use_adreno_kernels(backend_ctx, tensor)) {
7838+ if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor) ) {
78247839 static ggml_cl_buffer buf_trans_ql;
78257840 static ggml_cl_buffer buf_trans_qh;
78267841 static ggml_cl_buffer buf_trans_s;
@@ -13213,13 +13228,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1321313228 }
1321413229
1321513230 // q4_k x fp32
13216- if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
13231+ if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q4_K(src0) ) {
1321713232 ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
1321813233 return;
1321913234 }
1322013235
1322113236 // q6_K x fp32
13222- if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
13237+ if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q6_K(src0) ) {
1322313238 ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);
1322413239 return;
1322513240 }
0 commit comments