Skip to content

Commit 443c16a

Browse files
committed
opencl: enable tiled-wide q6_K/q4_K decode GEMV by default
Flip GGML_OPENCL_Q6K_GEMV_TILED and GGML_OPENCL_Q4K_GEMV_TILED from opt-in to default-on (opt out by setting the variable to 0), matching the GGML_OPENCL_Q6K_GEMV_O4 convention. The tiled lm_head/embed GEMV is bit-identical to the legacy path and gates only on long-vocab shapes (ne01>=32768, ne01%64==0).

Verified on Adreno X2 (asus-gly), tg64 fa=0 warmed -r3:
- Qwen3-1.7B-Q4_K_M: +10.3%/+5.9%/+3.5% @ d=0/4k/16k
- Llama-3.2-1B-Q4_K_M: +3.8%/+2.7%/+0.5% @ d=0/4k/16k

Greedy output is byte-identical between the default and =0.
1 parent e0ddf52 commit 443c16a

1 file changed

Lines changed: 11 additions & 9 deletions

File tree

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3139,8 +3139,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
31393139
GGML_LOG_CONT(".");
31403140
}
31413141

3142-
// gemv_noshuffle_q4_k_f32_tiled — tiled-wide canonical layout, opt-in via
3143-
// GGML_OPENCL_Q4K_GEMV_TILED=1 (separate convert + GEMV; weights via __global).
3142+
// gemv_noshuffle_q4_k_f32_tiled — tiled-wide canonical layout, default ON
3143+
// (opt out: GGML_OPENCL_Q4K_GEMV_TILED=0; separate convert + GEMV; weights via __global).
31443144
{
31453145
#ifdef GGML_OPENCL_EMBED_KERNELS
31463146
const std::string kernel_src {
@@ -3558,8 +3558,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
35583558
GGML_LOG_CONT(".");
35593559
}
35603560

3561-
// gemv_noshuffle_q6_k_f32_tiled — tiled-wide canonical layout, opt-in via
3562-
// GGML_OPENCL_Q6K_GEMV_TILED=1 (separate convert + GEMV; weights via __global).
3561+
// gemv_noshuffle_q6_k_f32_tiled — tiled-wide canonical layout, default ON
3562+
// (opt out: GGML_OPENCL_Q6K_GEMV_TILED=0; separate convert + GEMV; weights via __global).
35633563
{
35643564
#ifdef GGML_OPENCL_EMBED_KERNELS
35653565
const std::string kernel_src {
@@ -5678,12 +5678,13 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
56785678
return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
56795679
}
56805680

5681-
// Opt-in tiled-wide q6_K GEMV (default OFF). Both the convert (set_tensor) and
5682-
// the GEMV dispatch must agree on this so the buffer layout matches the kernel.
5681+
// Tiled-wide q6_K GEMV (default ON; opt out via GGML_OPENCL_Q6K_GEMV_TILED=0).
5682+
// Both the convert (set_tensor) and the GEMV dispatch must agree on this so the
5683+
// buffer layout matches the kernel.
56835684
inline bool q6k_gemv_tiled_enabled() {
56845685
static const bool en = []{
56855686
const char * e = std::getenv("GGML_OPENCL_Q6K_GEMV_TILED");
5686-
return e && e[0] != '\0' && e[0] != '0';
5687+
return !e || e[0] == '\0' || e[0] != '0';
56875688
}();
56885689
return en;
56895690
}
@@ -5695,11 +5696,12 @@ inline bool use_q6k_tiled(const ggml_tensor *tensor) {
56955696
tensor->ne[1] >= 32768 && tensor->ne[1] % 64 == 0;
56965697
}
56975698

5698-
// q4_K analog of the tiled-wide lm_head/embed GEMV (default OFF). Same gate.
5699+
// q4_K analog of the tiled-wide lm_head/embed GEMV (default ON; opt out via
5700+
// GGML_OPENCL_Q4K_GEMV_TILED=0). Same gate.
56995701
inline bool q4k_gemv_tiled_enabled() {
57005702
static const bool en = []{
57015703
const char * e = std::getenv("GGML_OPENCL_Q4K_GEMV_TILED");
5702-
return e && e[0] != '\0' && e[0] != '0';
5704+
return !e || e[0] == '\0' || e[0] != '0';
57035705
}();
57045706
return en;
57055707
}

0 commit comments

Comments
 (0)