Skip to content

Commit b9259d9

Browse files
committed
ggml-ve : Q4_K direct kernel — vgtlzx HBM gather (opt-in)
Adds a gather-based qs load variant that reads u32 lanes directly from raw HBM via _vel_vgtlzx_vvssl + _vel_vsfa_vvssl, eliminating the per-row scratch pack (16 vld+vst per block). Opt-in: GGML_VE_Q4K_STD_GATHER=1 (in addition to _DIRECT + _STD_CHUNK). How it works: - One-shot init of g_qs_gather_offsets[256] holding the byte offset pattern: (i/32)*144 + 16 + (i%32)*4 for i in 0..255. - Per chunk: chunk_base = row_start + chunk_start*144; abs_addrs = vsfa(off_v, shift=0, chunk_base, VL); qs_chunk = vgtlzx(abs_addrs, 0, 0, VL); - Address pattern is monotonic increasing (eight 128-byte runs separated by 16-byte block headers), so VE's gather hits a near-coalesced load -- not random-access cost. Saves nb vld + nb vst per row of HBM<->LLC traffic. Frees the g_qs_pool per-thread scratch buffer requirement. Measured: - Standalone test_q4k_std_matvec: ALL OK for both gather and no-gather variants on 12 shapes incl. K=17408. - 1B Q4_K_M: gather ~+12% pp, +3% tg over scratch-pack (within noise; high run-to-run variance). - 27B Q4_K_M N>1: 0.50/0.46 t/s (vs 0.50/0.44 scratch-pack). Modest +5% tg. The win is real but modest because the scratch pack was already fast (sequential vector vld+vst at MVL). The gather route gives more headroom for future kernels that may want to read partial chunks or skip blocks, but for the current dense chunked path it's basically a wash. Task ggml-org#64.
1 parent bff568d commit b9259d9

2 files changed

Lines changed: 168 additions & 7 deletions

File tree

ggml/src/ggml-ve/kernels-veda/q4k_std_dispatch.c

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ extern float q4k_std_row_dot_chunked_hdr_extern(const uint8_t *blk_row,
3030
const float *x_high_perm,
3131
uint8_t *qs_scratch,
3232
int nb);
33+
extern float q4k_std_row_dot_chunked_gather_hdr_extern(const uint8_t *blk_row,
34+
const float *hdr_decoded_row,
35+
const float *x_low_perm,
36+
const float *x_high_perm,
37+
int nb);
3338
extern void q4k_std_build_x_perm_extern(const float *x,
3439
float *x_low_perm,
3540
float *x_high_perm, int K);
@@ -119,18 +124,31 @@ uint64_t ve_q4k_matvec_std_hdr_hbm(uint64_t y_vptr, uint64_t W_vptr,
119124
}
120125

121126
const size_t hdr_row_floats = (size_t) nb * 16; /* 16 fp32 per block */
122-
#pragma omp parallel num_threads(nthr)
123-
{
124-
int tid = omp_get_thread_num();
125-
uint8_t *qs_scratch = g_qs_pool + (size_t) tid * g_qs_per_thread;
126-
#pragma omp for
127+
const int use_gather = (getenv("GGML_VE_Q4K_STD_GATHER") != NULL);
128+
if (use_gather) {
129+
#pragma omp parallel for num_threads(nthr)
127130
for (uint64_t m = 0; m < M; m++) {
128131
const uint8_t *blk_row = W + m * row_bytes;
129132
const float *hdr_row = hdr_all
130133
? hdr_all + m * hdr_row_floats
131134
: NULL;
132-
y[m] = q4k_std_row_dot_chunked_hdr_extern(blk_row, hdr_row,
133-
g_xlo_perm, g_xhi_perm, qs_scratch, nb);
135+
y[m] = q4k_std_row_dot_chunked_gather_hdr_extern(blk_row, hdr_row,
136+
g_xlo_perm, g_xhi_perm, nb);
137+
}
138+
} else {
139+
#pragma omp parallel num_threads(nthr)
140+
{
141+
int tid = omp_get_thread_num();
142+
uint8_t *qs_scratch = g_qs_pool + (size_t) tid * g_qs_per_thread;
143+
#pragma omp for
144+
for (uint64_t m = 0; m < M; m++) {
145+
const uint8_t *blk_row = W + m * row_bytes;
146+
const float *hdr_row = hdr_all
147+
? hdr_all + m * hdr_row_floats
148+
: NULL;
149+
y[m] = q4k_std_row_dot_chunked_hdr_extern(blk_row, hdr_row,
150+
g_xlo_perm, g_xhi_perm, qs_scratch, nb);
151+
}
134152
}
135153
}
136154
} else if (use_tile) {

ggml/src/ggml-ve/kernels-veda/q4k_std_intrin.c

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,26 @@ float q4k_std_row_dot_chunked_hdr_extern(const uint8_t *blk_row,
331331
uint8_t *qs_scratch,
332332
int nb);
333333

334+
/* GATHER variant: skips the scratch pack and uses vgtlzx + vsfa to load
335+
* cn*32 u32 lanes directly from raw HBM. Address pattern per chunk:
336+
* addr[i] = chunk_base + (i/32)*144 + 16 + (i%32)*4
337+
* with chunk_base = blk_row + chunk_start * 144.
338+
*
339+
* The offset vector (i/32)*144 + 16 + (i%32)*4 doesn't depend on
340+
* chunk_start, so it's precomputed once into g_qs_gather_offsets (a
341+
* static MAX_VL=256 u64 array) and loaded per-chunk. */
342+
float q4k_std_row_dot_chunked_gather_hdr_extern(const uint8_t *blk_row,
343+
const float *hdr_decoded_row,
344+
const float *x_low_perm,
345+
const float *x_high_perm,
346+
int nb);
347+
348+
float q4k_std_row_dot_chunked_gather_hdr_extern(const uint8_t *blk_row,
349+
const float *hdr_decoded_row,
350+
const float *x_low_perm,
351+
const float *x_high_perm,
352+
int nb);
353+
334354
float q4k_std_row_dot_chunked_extern(const uint8_t *blk_row,
335355
const float *x_low_perm,
336356
const float *x_high_perm,
@@ -457,3 +477,126 @@ float q4k_std_row_dot_chunked_hdr_extern(const uint8_t *blk_row,
457477

458478
return acc;
459479
}
480+
481+
/* ---- Gather variant ---- */
482+
483+
/* Static offset vector. Initialised lazily on first call. Pattern:
484+
* off[i] = (i/32)*144 + 16 + (i%32)*4 for i in 0..MAX-1
485+
* MAX = Q4K_STD_CHUNK*32 = 256 (matches MVL). */
486+
#define Q4K_STD_GATHER_VL (Q4K_STD_CHUNK * 32)
487+
static uint64_t g_qs_gather_offsets[Q4K_STD_GATHER_VL] __attribute__((aligned(64)));
488+
static int g_qs_gather_init = 0;
489+
490+
static void q4k_std_init_gather_offsets(void) {
491+
for (int i = 0; i < Q4K_STD_GATHER_VL; i++) {
492+
const int cb = i / 32;
493+
const int ii = i % 32;
494+
g_qs_gather_offsets[i] = (uint64_t)(cb * 144 + 16 + ii * 4);
495+
}
496+
g_qs_gather_init = 1;
497+
}
498+
499+
float q4k_std_row_dot_chunked_gather_hdr_extern(const uint8_t *blk_row,
500+
const float *hdr_decoded_row,
501+
const float *x_low_perm,
502+
const float *x_high_perm,
503+
int nb) {
504+
if (!g_qs_gather_init) q4k_std_init_gather_offsets();
505+
506+
/* Preload the offset vector at MAX VL. */
507+
__vr off_v = _vel_vld_vssl(8, (void *)g_qs_gather_offsets, Q4K_STD_GATHER_VL);
508+
509+
float acc = 0.0f;
510+
511+
for (int chunk_start = 0; chunk_start < nb; chunk_start += Q4K_STD_CHUNK) {
512+
int cn = (nb - chunk_start) < Q4K_STD_CHUNK ? (nb - chunk_start) : Q4K_STD_CHUNK;
513+
const int VL = cn * 32;
514+
515+
/* Build chunk-relative absolute addresses: chunk_base + offsets. */
516+
const uint64_t chunk_base = (uint64_t)(uintptr_t)(blk_row + (size_t) chunk_start * 144);
517+
__vr addrs = _vel_vsfa_vvssl(off_v, /*shift=*/0, chunk_base, VL);
518+
519+
/* Gather load: each lane reads u32 at addrs[lane]. */
520+
__vr qs_chunk = _vel_vgtlzx_vvssl(addrs, /*sw=*/0, /*sz=*/0, VL);
521+
__vr mask = _vel_vbrdl_vsl(0x0FUL, VL);
522+
523+
/* Headers: same logic as the scratch-pack variant. */
524+
float d_sub_chunk[Q4K_STD_CHUNK * 8];
525+
float m_sub_chunk[Q4K_STD_CHUNK * 8];
526+
if (hdr_decoded_row != NULL) {
527+
const float *hdr_chunk = hdr_decoded_row + (size_t) chunk_start * 16;
528+
for (int cb = 0; cb < cn; cb++) {
529+
const float *blk_hdr = hdr_chunk + (size_t) cb * 16;
530+
for (int s = 0; s < 8; s++) {
531+
d_sub_chunk[cb * 8 + s] = blk_hdr[s ];
532+
m_sub_chunk[cb * 8 + s] = blk_hdr[8 + s];
533+
}
534+
}
535+
} else {
536+
for (int cb = 0; cb < cn; cb++) {
537+
const uint8_t *blk = blk_row + (size_t)(chunk_start + cb) * 144;
538+
uint16_t d_raw, dmin_raw;
539+
memcpy(&d_raw, blk + 0, 2);
540+
memcpy(&dmin_raw, blk + 2, 2);
541+
const float d_super = h2f(d_raw);
542+
const float dmin_super = h2f(dmin_raw);
543+
const uint8_t *sc12 = blk + 4;
544+
for (int s = 0; s < 8; s++) {
545+
uint8_t sc, mn;
546+
q4k_sm(s, sc12, &sc, &mn);
547+
d_sub_chunk[cb * 8 + s] = d_super * (float) sc;
548+
m_sub_chunk[cb * 8 + s] = dmin_super * (float) mn;
549+
}
550+
}
551+
}
552+
553+
float dlane_lo[256], mlane_lo[256], dlane_hi[256], mlane_hi[256];
554+
for (int cb = 0; cb < cn; cb++) {
555+
const float *d_blk = d_sub_chunk + (size_t) cb * 8;
556+
const float *m_blk = m_sub_chunk + (size_t) cb * 8;
557+
for (int q = 0; q < 4; q++) {
558+
const float d_l = d_blk[2 * q], m_l = m_blk[2 * q];
559+
const float d_h = d_blk[2 * q + 1], m_h = m_blk[2 * q + 1];
560+
for (int j = 0; j < 8; j++) {
561+
const int lane = cb * 32 + q * 8 + j;
562+
dlane_lo[lane] = d_l;
563+
mlane_lo[lane] = m_l;
564+
dlane_hi[lane] = d_h;
565+
mlane_hi[lane] = m_h;
566+
}
567+
}
568+
}
569+
__vr dlv = _vel_vldu_vssl(4, (void *) dlane_lo, VL);
570+
__vr mlv = _vel_vldu_vssl(4, (void *) mlane_lo, VL);
571+
__vr dhv = _vel_vldu_vssl(4, (void *) dlane_hi, VL);
572+
__vr mhv = _vel_vldu_vssl(4, (void *) mlane_hi, VL);
573+
574+
__vr acc_v = _vel_vbrds_vsl(0.0f, VL);
575+
576+
for (int bp = 0; bp < 4; bp++) {
577+
__vr shifted = _vel_vsrl_vvsl(qs_chunk, 8 * bp, VL);
578+
__vr nib_lo = _vel_vand_vvvl(shifted, mask, VL);
579+
__vr nib_hi = _vel_vand_vvvl(_vel_vsrl_vvsl(shifted, 4, VL), mask, VL);
580+
__vr nlf = _vel_vcvtsw_vvl(nib_lo, VL);
581+
__vr nhf = _vel_vcvtsw_vvl(nib_hi, VL);
582+
583+
__vr xl = _vel_vldu_vssl(4,
584+
(void *)(x_low_perm + (size_t) bp * nb * 32 + (size_t) chunk_start * 32), VL);
585+
__vr xh = _vel_vldu_vssl(4,
586+
(void *)(x_high_perm + (size_t) bp * nb * 32 + (size_t) chunk_start * 32), VL);
587+
588+
__vr w_lo = _vel_vfmuls_vvvl(dlv, nlf, VL);
589+
w_lo = _vel_vfsubs_vvvl(w_lo, mlv, VL);
590+
acc_v = _vel_vfmads_vvvvl(acc_v, w_lo, xl, VL);
591+
592+
__vr w_hi = _vel_vfmuls_vvvl(dhv, nhf, VL);
593+
w_hi = _vel_vfsubs_vvvl(w_hi, mhv, VL);
594+
acc_v = _vel_vfmads_vvvvl(acc_v, w_hi, xh, VL);
595+
}
596+
597+
__vr red = _vel_vfsums_vvl(acc_v, VL);
598+
acc += _vel_lvss_svs(red, 0);
599+
}
600+
601+
return acc;
602+
}

0 commit comments

Comments
 (0)