Skip to content

Commit 63e66fd

Browse files
authored
opencl: use flat variants of q4_K and q6_K gemv for very large M (ggml-org#24006)
1 parent 5c394fd commit 63e66fd

1 file changed

Lines changed: 23 additions & 8 deletions

File tree

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 23 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4950,6 +4950,21 @@ inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backen
49504950
return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
49514951
}
49524952

4953+
static inline bool use_flat_gemv_for_large_m_q4_K(const ggml_tensor *tensor) {
4954+
// Returns true when the flat gemv variant should be used instead of gemv_noshuffle, whose perf drops for large M (M is ne[1]).
4955+
// The 32768 threshold on ne[1] is well above typical hidden/FFN dims, but below typical vocab sizes.
4956+
// Note: this forces large-M weights to use LM GEMM. Only ne[] is inspected (ne[2] and ne[3] must both be 1); the tensor type is not checked here.
4957+
return tensor->ne[1] >= 32768 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
4958+
}
4959+
4960+
static inline bool use_flat_gemv_for_large_m_q6_K(const ggml_tensor *tensor) {
4961+
// Returns true when the flat gemv variant should be used instead of gemv_noshuffle, whose perf drops for large M (M is ne[1]).
4962+
// The 32768 threshold on ne[1] is well above typical hidden/FFN dims, but below typical vocab sizes.
4963+
// The q6_K flat gemv is worse for smaller K (K is ne[0]), so ne[0] >= 2048 is also required; 2048 seems to be a reasonable threshold.
4964+
// Note: this forces large-M weights to use LM GEMM. Only ne[] is inspected (ne[2] and ne[3] must both be 1); the tensor type is not checked here.
4965+
return tensor->ne[1] >= 32768 && tensor->ne[0] >= 2048 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
4966+
}
4967+
49534968
static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
49544969
ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
49554970
ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
@@ -6595,7 +6610,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
65956610

65966611
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
65976612
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K;
6598-
if (use_adreno_kernels(backend_ctx, tensor)) {
6613+
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
65996614
kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle;
66006615
}
66016616
#else
@@ -6623,7 +6638,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
66236638

66246639
tensor->extra = extra;
66256640
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
6626-
if (use_adreno_kernels(backend_ctx, tensor)) {
6641+
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
66276642

66286643
int M = tensor->ne[1];
66296644
int K = tensor->ne[0];
@@ -6923,7 +6938,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
69236938
cl_kernel kernel;
69246939
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
69256940
kernel = backend_ctx->kernel_convert_block_q6_K;
6926-
if (use_adreno_kernels(backend_ctx, tensor)) {
6941+
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
69276942
kernel = backend_ctx->kernel_convert_block_q6_K_noshuffle;
69286943
}
69296944
#else
@@ -6956,7 +6971,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
69566971
tensor->extra = extra;
69576972

69586973
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
6959-
if (use_adreno_kernels(backend_ctx, tensor)) {
6974+
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
69606975
cl_int M = tensor->ne[1]; // ne01
69616976
cl_int K = tensor->ne[0]; // ne00
69626977

@@ -7599,7 +7614,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
75997614
CL_CHECK(clReleaseMemObject(data_device));
76007615
return;
76017616
}
7602-
if (use_adreno_kernels(backend_ctx, tensor)) {
7617+
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q4_K(tensor)) {
76037618
int M = tensor->ne[1];
76047619
int K = tensor->ne[0];
76057620

@@ -7820,7 +7835,7 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
78207835
CL_CHECK(clReleaseMemObject(data_device));
78217836
return;
78227837
}
7823-
if (use_adreno_kernels(backend_ctx, tensor)) {
7838+
if (use_adreno_kernels(backend_ctx, tensor) && !use_flat_gemv_for_large_m_q6_K(tensor)) {
78247839
static ggml_cl_buffer buf_trans_ql;
78257840
static ggml_cl_buffer buf_trans_qh;
78267841
static ggml_cl_buffer buf_trans_s;
@@ -13213,13 +13228,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
1321313228
}
1321413229

1321513230
// q4_k x fp32
13216-
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32) {
13231+
if (src0t == GGML_TYPE_Q4_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q4_K(src0)) {
1321713232
ggml_cl_mul_mat_q4_k_f32_adreno(backend, src0, src1, dst);
1321813233
return;
1321913234
}
1322013235

1322113236
// q6_K x fp32
13222-
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32) {
13237+
if (src0t == GGML_TYPE_Q6_K && src1t == GGML_TYPE_F32 && !use_flat_gemv_for_large_m_q6_K(src0)) {
1322313238
ggml_cl_mul_mat_q6_K_f32_adreno(backend, src0, src1, dst);
1322413239
return;
1322513240
}

0 commit comments

Comments
 (0)