opencl: enable tiled-wide q6_K/q4_K decode GEMV by default

wanghqc · wanghqc · commit 443c16afed91 · 2026-05-30T19:01:10.000-07:00
Flip GGML_OPENCL_Q6K_GEMV_TILED and GGML_OPENCL_Q4K_GEMV_TILED from opt-in
to default-on (opt out with =0), matching the GGML_OPENCL_Q6K_GEMV_O4
convention. The tiled lm_head/embed GEMV is bit-identical to the legacy
path and is gated to long-vocab shapes only (ne01>=32768, ne01%64==0).

Verified on Adreno X2 (asus-gly), tg64 fa=0 warmed -r3:
- Qwen3-1.7B-Q4_K_M: +10.3%/+5.9%/+3.5% @ d=0/4k/16k
- Llama-3.2-1B-Q4_K_M: +3.8%/+2.7%/+0.5% @ d=0/4k/16k
Greedy output is byte-identical between the default (tiled on) and the =0 opt-out.
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3139,8 +3139,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
         GGML_LOG_CONT(".");
     }
 
-    // gemv_noshuffle_q4_k_f32_tiled — tiled-wide canonical layout, opt-in via
-    // GGML_OPENCL_Q4K_GEMV_TILED=1 (separate convert + GEMV; weights via __global).
+    // gemv_noshuffle_q4_k_f32_tiled — tiled-wide canonical layout, default ON
+    // (opt out: GGML_OPENCL_Q4K_GEMV_TILED=0; separate convert + GEMV; weights via __global).
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
@@ -3558,8 +3558,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
         GGML_LOG_CONT(".");
     }
 
-    // gemv_noshuffle_q6_k_f32_tiled — tiled-wide canonical layout, opt-in via
-    // GGML_OPENCL_Q6K_GEMV_TILED=1 (separate convert + GEMV; weights via __global).
+    // gemv_noshuffle_q6_k_f32_tiled — tiled-wide canonical layout, default ON
+    // (opt out: GGML_OPENCL_Q6K_GEMV_TILED=0; separate convert + GEMV; weights via __global).
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
@@ -5678,12 +5678,13 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
     return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
 }
 
-// Opt-in tiled-wide q6_K GEMV (default OFF). Both the convert (set_tensor) and
-// the GEMV dispatch must agree on this so the buffer layout matches the kernel.
+// Tiled-wide q6_K GEMV (default ON; opt out via GGML_OPENCL_Q6K_GEMV_TILED=0).
+// Both the convert (set_tensor) and the GEMV dispatch must agree on this so the
+// buffer layout matches the kernel.
 inline bool q6k_gemv_tiled_enabled() {
     static const bool en = []{
         const char * e = std::getenv("GGML_OPENCL_Q6K_GEMV_TILED");
-        return e && e[0] != '\0' && e[0] != '0';
+        return !e || e[0] == '\0' || e[0] != '0';
     }();
     return en;
 }
@@ -5695,11 +5696,12 @@ inline bool use_q6k_tiled(const ggml_tensor *tensor) {
            tensor->ne[1] >= 32768 && tensor->ne[1] % 64 == 0;
 }
 
-// q4_K analog of the tiled-wide lm_head/embed GEMV (default OFF). Same gate.
+// q4_K analog of the tiled-wide lm_head/embed GEMV (default ON; opt out via
+// GGML_OPENCL_Q4K_GEMV_TILED=0). Same gate.
 inline bool q4k_gemv_tiled_enabled() {
     static const bool en = []{
         const char * e = std::getenv("GGML_OPENCL_Q4K_GEMV_TILED");
-        return e && e[0] != '\0' && e[0] != '0';
+        return !e || e[0] == '\0' || e[0] != '0';
     }();
     return en;
 }