Skip to content

Commit dc98e54

Browse files
committed
ggml-ve : Q4_K direct kernel — packed-fp32 pvfmad (+43% tg on 27B)
Adds a packed-FP32 variant that uses _vel_pvfmad_vvvvl (2 fp32 per 64-bit lane) to halve the FMA chain. Opt-in via GGML_VE_Q4K_STD_PACKED=1 (compatible with _STD_CHUNK + _DIRECT). Codex called this the biggest remaining win for direct Q4_K, and the bench bears that out on 27B Q4_K_M: Direct chunked (baseline) : 0.50 pp / 0.44 tg t/s Direct chunked + packed (NEW) : 0.68 pp / 0.63 tg t/s +36% pp, +43% tg On 1B Q4_K_M (3-run averages, high variance): Direct chunked : 20.25 pp / 9.00 tg Direct chunked + packed : 21.89 pp / 9.50 tg (+8% pp / +5% tg) How it works: - Per lane, pack (d_low, d_high) into one 64-bit dlane_pk word, similarly (-m_low, -m_high) into mlane_pk (negated for the pvfmad encoding w = -m + d*nib = d*nib - m). - Per byte position bp, build packed nibbles: low_nib = (qs >> 8bp) & 0x0F (bits 0..3) high_nib = (qs >> (8bp+4)) & 0x0F << 32 (bits 32..35) nib_pk = low_nib | high_nib - pvcvtsw converts packed int32 -> packed fp32. - pvfmad: w_pk = -m_pk + d_pk*nib_pk, then acc_pk = pvfmad(acc_pk, w_pk, x_pk). - Reduce the packed accumulator by extracting low+high halves of each lane and summing (pattern mirrors q4k_full_intrin.c:698-705). x_perm builder: new q4k_std_build_x_perm_packed_extern produces [bp][b][i] u64 layout, each u64 = (x_low | x_high << 32). Same total bytes as the two unpacked float arrays. One pass per matvec. Per chunk: - Before: 4 bp × 2 halves = 8 VL=cn*32 FMAs. - Now: 4 bp × 1 packed = 4 VL=cn*32 packed FMAs (each does 2 fp32 multiplies per lane = 8 total per cycle on the packed pipeline). Net: 2x arithmetic density, real win on FMA-bound paths. Standalone test_q4k_std_matvec ALL OK on packed variant, 12 shapes incl. K=17408; max_abs 5.7e-6 (tighter than unpacked 8.1e-6). Task ggml-org#63.
1 parent b9259d9 commit dc98e54

2 files changed

Lines changed: 203 additions & 4 deletions

File tree

ggml/src/ggml-ve/kernels-veda/q4k_std_dispatch.c

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,22 @@ extern float q4k_std_row_dot_chunked_gather_hdr_extern(const uint8_t *blk_row,
3535
const float *x_low_perm,
3636
const float *x_high_perm,
3737
int nb);
38+
extern float q4k_std_row_dot_chunked_packed_hdr_extern(const uint8_t *blk_row,
39+
const float *hdr_decoded_row,
40+
const uint64_t *x_pk_perm,
41+
int nb);
3842
extern void q4k_std_build_x_perm_extern(const float *x,
3943
float *x_low_perm,
4044
float *x_high_perm, int K);
45+
extern void q4k_std_build_x_perm_packed_extern(const float *x,
46+
uint64_t *x_pk_perm, int K);
4147

4248
/* Reusable per-matvec buffers; grow monotonically. */
43-
static float * g_xlo_perm = NULL;
44-
static float * g_xhi_perm = NULL;
45-
static size_t g_xperm_cap = 0;
49+
static float * g_xlo_perm = NULL;
50+
static float * g_xhi_perm = NULL;
51+
static size_t g_xperm_cap = 0;
52+
static uint64_t * g_xpk_perm = NULL; /* packed x_perm for pvfmad path */
53+
static size_t g_xpk_cap = 0;
4654

4755
/* Per-thread qs scratch pool. Sized for nb*128 bytes * nthr_cap. */
4856
static uint8_t * g_qs_pool = NULL;
@@ -125,7 +133,34 @@ uint64_t ve_q4k_matvec_std_hdr_hbm(uint64_t y_vptr, uint64_t W_vptr,
125133

126134
const size_t hdr_row_floats = (size_t) nb * 16; /* 16 fp32 per block */
127135
const int use_gather = (getenv("GGML_VE_Q4K_STD_GATHER") != NULL);
128-
if (use_gather) {
136+
const int use_packed = (getenv("GGML_VE_Q4K_STD_PACKED") != NULL);
137+
138+
if (use_packed) {
139+
/* Build packed x_perm (low|high<<32 per element). Same total
140+
* floats as the unpacked variant; just packed layout. */
141+
const size_t pk_need = (size_t) K * sizeof(float); /* same byte count
142+
* as 2 float arrays
143+
* combined (2*K*4 = K*8) */
144+
const size_t pk_need_bytes = (size_t) nb * 4 * 32 * sizeof(uint64_t);
145+
(void) pk_need;
146+
if (pk_need_bytes > g_xpk_cap) {
147+
if (g_xpk_perm) free(g_xpk_perm);
148+
g_xpk_perm = (uint64_t *) aligned_alloc(64, pk_need_bytes);
149+
g_xpk_cap = pk_need_bytes;
150+
if (g_xpk_perm == NULL) return 8;
151+
}
152+
q4k_std_build_x_perm_packed_extern(x, g_xpk_perm, (int) K);
153+
154+
#pragma omp parallel for num_threads(nthr)
155+
for (uint64_t m = 0; m < M; m++) {
156+
const uint8_t *blk_row = W + m * row_bytes;
157+
const float *hdr_row = hdr_all
158+
? hdr_all + m * hdr_row_floats
159+
: NULL;
160+
y[m] = q4k_std_row_dot_chunked_packed_hdr_extern(blk_row, hdr_row,
161+
g_xpk_perm, nb);
162+
}
163+
} else if (use_gather) {
129164
#pragma omp parallel for num_threads(nthr)
130165
for (uint64_t m = 0; m < M; m++) {
131166
const uint8_t *blk_row = W + m * row_bytes;

ggml/src/ggml-ve/kernels-veda/q4k_std_intrin.c

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,30 @@ void q4k_std_build_x_perm_extern(const float *x, float *x_low_perm,
9999
}
100100
}
101101

102+
/* PACKED variant of the x permute: per bp, nb*32 u64 where each u64 packs
103+
* (x_low, x_high) as low|high<<32. Used by the pvfmad kernel below. */
104+
void q4k_std_build_x_perm_packed_extern(const float *x, uint64_t *x_pk_perm, int K);
105+
106+
void q4k_std_build_x_perm_packed_extern(const float *x, uint64_t *x_pk_perm, int K) {
107+
const int nb = K / 256;
108+
for (int bp = 0; bp < 4; bp++) {
109+
uint64_t *xpk_bp = x_pk_perm + (size_t) bp * nb * 32;
110+
for (int b = 0; b < nb; b++) {
111+
const float *xb = x + (size_t) b * 256;
112+
for (int i = 0; i < 32; i++) {
113+
const int qq = i / 8;
114+
const int ii = i % 8;
115+
const float x_lo = xb[64 * qq + 4 * ii + bp];
116+
const float x_hi = xb[64 * qq + 32 + 4 * ii + bp];
117+
uint32_t lo_bits, hi_bits;
118+
memcpy(&lo_bits, &x_lo, 4);
119+
memcpy(&hi_bits, &x_hi, 4);
120+
xpk_bp[b * 32 + i] = ((uint64_t) hi_bits << 32) | lo_bits;
121+
}
122+
}
123+
}
124+
}
125+
102126
/* Inner per-row dot using pre-permuted x. blk_row points at row m's
103127
* first block. x_low_perm and x_high_perm are nb*128 floats each. */
104128
float q4k_std_row_dot_xperm_extern(const uint8_t *blk_row,
@@ -600,3 +624,143 @@ float q4k_std_row_dot_chunked_gather_hdr_extern(const uint8_t *blk_row,
600624

601625
return acc;
602626
}
627+
628+
/* ---- Packed pvfmad variant ---- *
629+
*
630+
* Packs low+high nibble FMAs into _vel_pvfmad_vvvvl (packed FP32 -- 2
631+
* elements per 64-bit lane). Per chunk:
632+
* - 4 byte-positions × 1 packed FMA chain = 4 packed FMAs (vs 8 in
633+
* the non-packed chunked kernel: 4 bp × 2 halves).
634+
* - dlane_pk[i] = pack(d_low, d_high), mlane_pk[i] = pack(-m_lo, -m_hi)
635+
* - x_pk[i] = pack(x_low, x_high) -- preloaded once per matvec
636+
* - nib_pk[i] = pack(low_nib_i, high_nib_i)
637+
* - w_pk = pvfmad(neg_m_pk, d_pk, nib_f_pk) # = d*nib - m (packed)
638+
* - acc_pk = pvfmad(acc_pk, w_pk, x_pk)
639+
*
640+
* Reduction: extract low and high halves of each lane, sum to scalar.
641+
*
642+
* Inputs:
643+
* x_pk_perm: packed x permute (low|high<<32 per bp×nb×32 layout)
644+
*/
645+
float q4k_std_row_dot_chunked_packed_hdr_extern(const uint8_t *blk_row,
646+
const float *hdr_decoded_row,
647+
const uint64_t *x_pk_perm,
648+
int nb);
649+
650+
float q4k_std_row_dot_chunked_packed_hdr_extern(const uint8_t *blk_row,
651+
const float *hdr_decoded_row,
652+
const uint64_t *x_pk_perm,
653+
int nb) {
654+
if (!g_qs_gather_init) q4k_std_init_gather_offsets();
655+
656+
/* Preload the offset vector at MAX VL (only used by gather path). */
657+
__vr off_v = _vel_vld_vssl(8, (void *) g_qs_gather_offsets, Q4K_STD_GATHER_VL);
658+
659+
float acc = 0.0f;
660+
661+
for (int chunk_start = 0; chunk_start < nb; chunk_start += Q4K_STD_CHUNK) {
662+
int cn = (nb - chunk_start) < Q4K_STD_CHUNK ? (nb - chunk_start) : Q4K_STD_CHUNK;
663+
const int VL = cn * 32;
664+
665+
/* qs gather (same as gather variant). */
666+
const uint64_t chunk_base = (uint64_t)(uintptr_t)(blk_row + (size_t) chunk_start * 144);
667+
__vr addrs = _vel_vsfa_vvssl(off_v, 0, chunk_base, VL);
668+
__vr qs_chunk = _vel_vgtlzx_vvssl(addrs, 0, 0, VL);
669+
670+
/* Header source: cached pre-decoded, else live decode. */
671+
float d_sub_chunk[Q4K_STD_CHUNK * 8];
672+
float m_sub_chunk[Q4K_STD_CHUNK * 8];
673+
if (hdr_decoded_row != NULL) {
674+
const float *hdr_chunk = hdr_decoded_row + (size_t) chunk_start * 16;
675+
for (int cb = 0; cb < cn; cb++) {
676+
const float *blk_hdr = hdr_chunk + (size_t) cb * 16;
677+
for (int s = 0; s < 8; s++) {
678+
d_sub_chunk[cb * 8 + s] = blk_hdr[s ];
679+
m_sub_chunk[cb * 8 + s] = blk_hdr[8 + s];
680+
}
681+
}
682+
} else {
683+
for (int cb = 0; cb < cn; cb++) {
684+
const uint8_t *blk = blk_row + (size_t)(chunk_start + cb) * 144;
685+
uint16_t d_raw, dmin_raw;
686+
memcpy(&d_raw, blk + 0, 2);
687+
memcpy(&dmin_raw, blk + 2, 2);
688+
const float d_super = h2f(d_raw);
689+
const float dmin_super = h2f(dmin_raw);
690+
const uint8_t *sc12 = blk + 4;
691+
for (int s = 0; s < 8; s++) {
692+
uint8_t sc, mn;
693+
q4k_sm(s, sc12, &sc, &mn);
694+
d_sub_chunk[cb * 8 + s] = d_super * (float) sc;
695+
m_sub_chunk[cb * 8 + s] = dmin_super * (float) mn;
696+
}
697+
}
698+
}
699+
700+
/* Build packed dlane/mlane (negated m for pvfmad).
701+
* dlane_pk[lane] = pack(d_low, d_high) = hi<<32 | lo bits
702+
* mlane_pk[lane] = pack(-m_low, -m_high) */
703+
uint64_t dlane_pk[256], mlane_pk[256];
704+
for (int cb = 0; cb < cn; cb++) {
705+
const float *d_blk = d_sub_chunk + (size_t) cb * 8;
706+
const float *m_blk = m_sub_chunk + (size_t) cb * 8;
707+
for (int q = 0; q < 4; q++) {
708+
const float d_l = d_blk[2 * q], d_h = d_blk[2 * q + 1];
709+
const float m_l = -m_blk[2 * q], m_h = -m_blk[2 * q + 1];
710+
uint32_t dl_b, dh_b, ml_b, mh_b;
711+
memcpy(&dl_b, &d_l, 4); memcpy(&dh_b, &d_h, 4);
712+
memcpy(&ml_b, &m_l, 4); memcpy(&mh_b, &m_h, 4);
713+
const uint64_t d_pk = ((uint64_t) dh_b << 32) | dl_b;
714+
const uint64_t m_pk = ((uint64_t) mh_b << 32) | ml_b;
715+
for (int j = 0; j < 8; j++) {
716+
const int lane = cb * 32 + q * 8 + j;
717+
dlane_pk[lane] = d_pk;
718+
mlane_pk[lane] = m_pk;
719+
}
720+
}
721+
}
722+
__vr d_pk_v = _vel_vld_vssl(8, (void *) dlane_pk, VL);
723+
__vr m_pk_v = _vel_vld_vssl(8, (void *) mlane_pk, VL);
724+
725+
__vr acc_pk = _vel_vbrdl_vsl(0UL, VL);
726+
__vr lo_mask = _vel_vbrdl_vsl(0x000000000000000FUL, VL);
727+
728+
for (int bp = 0; bp < 4; bp++) {
729+
/* nib_pk lane i = low_nib_i | (high_nib_i << 32).
730+
* low_nib_i = (qs >> 8bp) & 0x0F
731+
* high_nib_i = (qs >> (8bp+4)) & 0x0F -> shift left 32 */
732+
__vr shifted_lo = _vel_vsrl_vvsl(qs_chunk, 8 * bp, VL);
733+
__vr shifted_hi = _vel_vsrl_vvsl(qs_chunk, 8 * bp + 4, VL);
734+
__vr low_nib = _vel_vand_vvvl(shifted_lo, lo_mask, VL);
735+
__vr high_nib = _vel_vand_vvvl(shifted_hi, lo_mask, VL);
736+
__vr high_upper = _vel_vsll_vvsl(high_nib, 32, VL);
737+
__vr nib_pk = _vel_vor_vvvl (low_nib, high_upper, VL);
738+
739+
/* Packed int32 -> packed fp32. */
740+
__vr nib_f_pk = _vel_pvcvtsw_vvl(nib_pk, VL);
741+
742+
/* w_pk = -m + d*nib (packed FMA). */
743+
__vr w_pk = _vel_pvfmad_vvvvl(m_pk_v, d_pk_v, nib_f_pk, VL);
744+
745+
/* Load packed x. */
746+
__vr x_pk = _vel_vld_vssl(8,
747+
(void *)(x_pk_perm + (size_t) bp * nb * 32 + (size_t) chunk_start * 32), VL);
748+
749+
/* acc_pk += w_pk * x_pk (packed FMA). */
750+
acc_pk = _vel_pvfmad_vvvvl(acc_pk, w_pk, x_pk, VL);
751+
}
752+
753+
/* Reduce packed accumulator. Pattern mirrors canon's packed
754+
* matvec at q4k_full_intrin.c:698-705. */
755+
__vr lo32_mask = _vel_vbrdl_vsl(0x00000000FFFFFFFFUL, VL);
756+
__vr acc_lo32 = _vel_vand_vvvl(acc_pk, lo32_mask, VL);
757+
__vr acc_hi32 = _vel_vsrl_vvsl(acc_pk, 32, VL);
758+
acc_lo32 = _vel_vsll_vvsl(acc_lo32, 32, VL);
759+
__vr acc_hi32_up = _vel_vsll_vvsl(acc_hi32, 32, VL);
760+
__vr acc_sum = _vel_vfadds_vvvl(acc_lo32, acc_hi32_up, VL);
761+
acc_sum = _vel_vfsums_vvl(acc_sum, VL);
762+
acc += _vel_lvss_svs(acc_sum, 0);
763+
}
764+
765+
return acc;
766+
}

0 commit comments

Comments
 (0)