Skip to content

Commit 47337a9

Browse files
committed
ggml-ve : Q4_K direct kernel — make chunked+packed the default variant
Collapses the three opt-in flags (GGML_VE_Q4K_STD_CHUNK + GGML_VE_Q4K_STD_PACKED) into the default path. With just GGML_VE_Q4K_DIRECT=1 the kernel now runs the fastest measured variant (chunked VL=256 + packed pvfmad) instead of single-row VL=32. Env vars are now OVERRIDES for A/B testing, all forcing slower paths: GGML_VE_Q4K_STD_PLAIN=1 single-row VL=32 (old default) GGML_VE_Q4K_STD_TILE=1 8-row tile GGML_VE_Q4K_STD_NOPACK=1 chunked unpacked (scratch pack) GGML_VE_Q4K_STD_GATHER=1 chunked unpacked vgtlzx gather 27B Q4_K_M with GGML_VE_Q4K_DIRECT=1 + N_GT_1=1 (no other flags): 0.70 pp / 0.60 tg t/s -- same as the fully-flagged packed path, now the default. (Single-row default was 0.32/0.30.) Standalone test_q4k_std_matvec ALL OK on the new default path (packed numerics, max_abs 5.7e-6), 12 shapes incl. K=17408. This is a strict improvement to the direct path with no downside; direct vs canon routing is unchanged (still opt-in).
1 parent dc98e54 commit 47337a9

1 file changed

Lines changed: 26 additions & 10 deletions

File tree

ggml/src/ggml-ve/kernels-veda/q4k_std_dispatch.c

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,13 +108,31 @@ uint64_t ve_q4k_matvec_std_hdr_hbm(uint64_t y_vptr, uint64_t W_vptr,
108108
if (g_xlo_perm == NULL || g_xhi_perm == NULL) return 5;
109109
}
110110

111-
q4k_std_build_x_perm_extern(x, g_xlo_perm, g_xhi_perm, (int) K);
112-
113-
/* GGML_VE_Q4K_STD_TILE=1 to try the 8-row tile (slower at the moment
114-
* due to register pressure -- left in for future tuning). */
115-
const int use_tile = (getenv("GGML_VE_Q4K_STD_TILE") != NULL);
116-
/* GGML_VE_Q4K_STD_CHUNK=1 to use cross-block chunking (VL up to 256). */
117-
const int use_chunk = (getenv("GGML_VE_Q4K_STD_CHUNK") != NULL);
111+
/* Variant selection. DEFAULT = chunked + packed (VL=256 pvfmad), the
112+
* fastest variant measured (27B Q4_K_M: 0.63 tg vs 0.30 single-row).
113+
* Env overrides force alternates, all for A/B testing / debugging:
114+
* GGML_VE_Q4K_STD_PLAIN=1 -> single-row VL=32 (old default)
115+
* GGML_VE_Q4K_STD_TILE=1 -> 8-row tile (register-pressure bound)
116+
* GGML_VE_Q4K_STD_NOPACK=1 -> chunked, unpacked (scratch pack)
117+
* GGML_VE_Q4K_STD_GATHER=1 -> chunked, unpacked, vgtlzx gather
118+
* (GGML_VE_Q4K_STD_CHUNK / _PACKED are accepted as explicit opt-ins
119+
* but are now the default, so they're no-ops unless an override
120+
* below disables them.) */
121+
const int force_plain = (getenv("GGML_VE_Q4K_STD_PLAIN") != NULL);
122+
const int force_tile = (getenv("GGML_VE_Q4K_STD_TILE") != NULL);
123+
const int force_nopack = (getenv("GGML_VE_Q4K_STD_NOPACK") != NULL);
124+
const int force_gather = (getenv("GGML_VE_Q4K_STD_GATHER") != NULL);
125+
126+
const int use_tile = force_tile;
127+
const int use_chunk = !force_plain && !force_tile;
128+
/* Packed is the default within chunked unless an unpacked override
129+
* (NOPACK or GATHER) is requested. */
130+
const int use_packed = use_chunk && !force_nopack && !force_gather;
131+
132+
/* Only build the perm layout the chosen path needs. */
133+
if (!use_packed) {
134+
q4k_std_build_x_perm_extern(x, g_xlo_perm, g_xhi_perm, (int) K);
135+
}
118136

119137
if (use_chunk) {
120138
/* Grow per-thread qs scratch. nb*128 bytes per thread. */
@@ -132,8 +150,6 @@ uint64_t ve_q4k_matvec_std_hdr_hbm(uint64_t y_vptr, uint64_t W_vptr,
132150
}
133151

134152
const size_t hdr_row_floats = (size_t) nb * 16; /* 16 fp32 per block */
135-
const int use_gather = (getenv("GGML_VE_Q4K_STD_GATHER") != NULL);
136-
const int use_packed = (getenv("GGML_VE_Q4K_STD_PACKED") != NULL);
137153

138154
if (use_packed) {
139155
/* Build packed x_perm (low|high<<32 per element). Same total
@@ -160,7 +176,7 @@ uint64_t ve_q4k_matvec_std_hdr_hbm(uint64_t y_vptr, uint64_t W_vptr,
160176
y[m] = q4k_std_row_dot_chunked_packed_hdr_extern(blk_row, hdr_row,
161177
g_xpk_perm, nb);
162178
}
163-
} else if (use_gather) {
179+
} else if (force_gather) {
164180
#pragma omp parallel for num_threads(nthr)
165181
for (uint64_t m = 0; m < M; m++) {
166182
const uint8_t *blk_row = W + m * row_bytes;

0 commit comments

Comments
 (0)