Add 8D local-scale and AW refinement for PQ3_K/PQ4_K

mengqin · mengqin · commit dc60b163c387 · 2026-04-30T12:43:39.000-07:00
- switch PQ3_K/PQ4_K to per-8D local scales
- add greedy R2P1 imatrix-aware refinement on the quant path
- update CUDA mmq, vecdot and dequant support for the new layout
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
@@ -405,19 +405,19 @@ typedef struct {
 static_assert(sizeof(block_pq2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong pq2_K block size/padding");
 
 typedef struct {
-    ggml_half d[2];
-    uint8_t scales[K_SCALE_SIZE];
+    ggml_half d[2];            // master band scales for two 128-wide bands
+    uint8_t scales[QK_K/16];   // 4-bit local scales for each 8D sub-block
     uint8_t hmask[QK_K/8];
     uint8_t qs[QK_K/4];
 } block_pq3_K;
-static_assert(sizeof(block_pq3_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/8 + QK_K/4, "wrong pq3_K block size/padding");
+static_assert(sizeof(block_pq3_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/8 + QK_K/4, "wrong pq3_K block size/padding");
 
 typedef struct {
-    ggml_half d[2];
-    uint8_t scales[K_SCALE_SIZE];
+    ggml_half d[2];            // master band scales for two 128-wide bands
+    uint8_t scales[QK_K/16];   // 4-bit local scales for each 8D sub-block
     uint8_t qs[QK_K/2];
 } block_pq4_K;
-static_assert(sizeof(block_pq4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong pq4_K block size/padding");
+static_assert(sizeof(block_pq4_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/2, "wrong pq4_K block size/padding");
 
 // 5-bit quantization
 // 8 blocks of 32 elements each
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
@@ -995,14 +995,14 @@ template<>
 struct ggml_cuda_type_traits<GGML_TYPE_PQ3_K> {
     static constexpr int qk = QK_K;
     static constexpr int qr = 1;
-    static constexpr int qi = 16;
+    static constexpr int qi = GGML_PQ3_K_SUBBLOCK_COUNT;
 };
 
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_PQ4_K> {
     static constexpr int qk = QK_K;
     static constexpr int qr = 1;
-    static constexpr int qi = 16;
+    static constexpr int qi = GGML_PQ4_K_SUBBLOCK_COUNT;
 };
 
 template<>
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
@@ -23,7 +23,7 @@ typedef void (*mmq_write_back_t)(const float * __restrict__ sum, const int32_t *
 
 template <ggml_type type>
 static constexpr __host__ __device__ bool mmq_type_has_mma() {
-    return type != GGML_TYPE_PQ2_K;
+    return type != GGML_TYPE_PQ2_K && type != GGML_TYPE_PQ3_K && type != GGML_TYPE_PQ4_K; // false only for PQ2_K, PQ3_K and PQ4_K
 }
 
 template <ggml_type type>
@@ -111,20 +111,22 @@ static constexpr __device__ int mmq_get_granularity_device_for_type(const int mm
 
 template <ggml_type type>
 static void mmq_log_selected_path_once(const int cc, const int mmq_x, const int mmq_y) {
-    if constexpr (type != GGML_TYPE_PQ2_K) {
+    if constexpr (type != GGML_TYPE_PQ2_K && type != GGML_TYPE_PQ3_K && type != GGML_TYPE_PQ4_K) { // only PQ2_K/PQ3_K/PQ4_K are logged
         return;
     }
 
-    static const bool enabled = getenv("GGML_CUDA_LOG_PQ2_K_MMQ") != nullptr;
+    static const bool enabled = getenv("GGML_CUDA_LOG_PQK_MMQ") != nullptr; // opt-in: any value enables it; read once (static)
     static bool logged = false;
 
     if (!enabled || logged) {
         return;
     }
 
     logged = true;
-    GGML_LOG_INFO("%s: PQ2_K MMQ using %s path (cc=%d, mmq_x=%d, mmq_y=%d)\n",
-            __func__, mmq_uses_mma_host<type>(cc) ? "mma" : "dp4a", cc, mmq_x, mmq_y);
+    const char * type_name = type == GGML_TYPE_PQ2_K ? "PQ2_K" : (type == GGML_TYPE_PQ3_K ? "PQ3_K" : "PQ4_K"); // the guard above leaves only these three types
+    GGML_LOG_INFO("%s: %s MMQ using %s path (cc=%d, mmq_x=%d, mmq_y=%d)\n",
+            __func__, type_name,
+            mmq_uses_mma_host<type>(cc) ? "mma" : "dp4a", cc, mmq_x, mmq_y);
 }
 
 enum mmq_q8_1_ds_layout {
@@ -308,8 +310,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
         case GGML_TYPE_PQ3_0:   return MMQ_DP4A_TXS_Q8_0;
         case GGML_TYPE_PQ4_0:   return MMQ_DP4A_TXS_Q8_0;
         case GGML_TYPE_PQ2_K:   return MMQ_DP4A_TXS_PQ2_K;
-        case GGML_TYPE_PQ3_K:   return MMQ_DP4A_TXS_Q8_0_16;
-        case GGML_TYPE_PQ4_K:   return MMQ_DP4A_TXS_Q8_0_16;
+        case GGML_TYPE_PQ3_K:   return MMQ_DP4A_TXS_PQ2_K;
+        case GGML_TYPE_PQ4_K:   return MMQ_DP4A_TXS_PQ2_K;
         case GGML_TYPE_Q4_0:    return MMQ_DP4A_TXS_Q4_0;
         case GGML_TYPE_Q4_1:    return MMQ_DP4A_TXS_Q4_1;
         case GGML_TYPE_Q5_0:    return MMQ_DP4A_TXS_Q8_0;
@@ -1455,6 +1457,132 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
     }
 }
 
+template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_pq3_K( // fills x_tile with unpacked PQ3_K quants and per-sub-block scale pairs
+    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
+    constexpr int nwarps = mmq_get_nwarps_device();
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    // PQ3_K uses the same 8D local-scale shared-memory layout as PQ2_K (mmq_get_dp4a_tile_x_sizes returns MMQ_DP4A_TXS_PQ2_K for it).
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_PQ3_K, mmq_y);
+    int   * x_qs = (int   *)  x_tile; // quant words start at the beginning of the tile
+    half2 * x_d  = (half2 *) (x_qs + txs.qs); // scale pairs are stored txs.qs ints after x_qs
+
+    constexpr int threads_per_row = 32; // each of these threads unpacks two qs bytes of one row
+    constexpr int nrows = warp_size / threads_per_row; // rows covered by one warp per iteration
+    const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; // thread index within its 32-thread row group
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
+        int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); // tile row handled by this thread
+
+        if (need_check) {
+            i = min(i, i_max); // clamp the row index to i_max
+        }
+
+        const block_pq3_K * bxi = (const block_pq3_K *) x + kbx0 + i*stride; // block for row i; rows are stride blocks apart
+        const int elem0 = 4*txi; // first element covered by qs[txi] (four 2-bit fields per byte)
+        const int elem1 = 128 + 4*txi; // element 128 positions after elem0
+        const uint8_t high0 = (bxi->hmask[elem0 >> 3] >> (elem0 & 7)) & 0x0Fu; // four consecutive hmask bits, one per element
+        const uint8_t high1 = (bxi->hmask[elem1 >> 3] >> (elem1 & 7)) & 0x0Fu;
+        const uint8_t qb0 = bxi->qs[txi]; // low 2 bits of four elements
+        const uint8_t qb1 = bxi->qs[MMQ_TILE_NE_K + txi];
+        const int q4_0 = ((((qb0 >> 0) & 0x03u) | ((high0 & 0x01u) << 2)) <<  0) // four 3-bit codes, one per nibble: bits 0-1 from qs, bit 2 from hmask
+                       | ((((qb0 >> 2) & 0x03u) | ((high0 & 0x02u) << 1)) <<  4)
+                       | ((((qb0 >> 4) & 0x03u) |  (high0 & 0x04u))       <<  8)
+                       | ((((qb0 >> 6) & 0x03u) | ((high0 & 0x08u) >> 1)) << 12);
+        const int q4_1 = ((((qb1 >> 0) & 0x03u) | ((high1 & 0x01u) << 2)) <<  0) // same packing for the second group of four elements
+                       | ((((qb1 >> 2) & 0x03u) | ((high1 & 0x02u) << 1)) <<  4)
+                       | ((((qb1 >> 4) & 0x03u) |  (high1 & 0x04u))       <<  8)
+                       | ((((qb1 >> 6) & 0x03u) | ((high1 & 0x08u) >> 1)) << 12);
+        const int2 vp = get_int_from_table_16(q4_0 | (q4_1 << 16), PQK_DP4A_VAL_3BIT_16); // NOTE(review): presumably maps each nibble to an int8 table value (even nibbles in .x, odd in .y) - confirm against get_int_from_table_16
+        const int qs0 = __byte_perm(vp.x, vp.y, 0x5140); // selector 0x5140 -> bytes x0, y0, x1, y1
+        const int qs1 = __byte_perm(vp.x, vp.y, 0x7362); // selector 0x7362 -> bytes x2, y2, x3, y3
+
+        x_qs[i*(2*MMQ_TILE_NE_K + 1) + 0             + txi] = qs0; // row stride is 2*MMQ_TILE_NE_K + 1 ints
+        x_qs[i*(2*MMQ_TILE_NE_K + 1) + MMQ_TILE_NE_K + txi] = qs1;
+    }
+
+    constexpr int scale_pairs_per_row = GGML_PQ3_K_SUBBLOCK_COUNT / 2; // one scales[] byte carries two 4-bit local scales
+    constexpr int scale_rows_per_warp = warp_size / scale_pairs_per_row; // rows whose scales one warp fills per iteration
+    const int ksp = threadIdx.x % scale_pairs_per_row; // index of the scales[] byte handled by this thread
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*scale_rows_per_warp) {
+        int i = i0 + threadIdx.y*scale_rows_per_warp + threadIdx.x/scale_pairs_per_row;
+
+        if (need_check) {
+            i = min(i, i_max); // clamp the row index to i_max
+        }
+
+        const block_pq3_K * bxi = (const block_pq3_K *) x + kbx0 + i*stride;
+        const int sub0 = 2*ksp; // first sub-block of the pair stored in scales[ksp]
+        const int band = sub0 / GGML_PQ3_K_SUBBLOCKS_PER_BAND; // selects which master scale d[band] applies
+        const float dbase = __half2float(bxi->d[band]) * PQK_DP4A_INV_SCALE_3BIT; // master scale pre-multiplied by the 3-bit DP4A inverse scale
+        const uint8_t qscale_pair = bxi->scales[ksp];
+        const half2 d_pair = make_half2(
+            dbase * PQ3_K_LOCAL_SCALE_LUT[qscale_pair & 0x0Fu], // low nibble -> first half
+            dbase * PQ3_K_LOCAL_SCALE_LUT[qscale_pair >> 4]); // high nibble -> second half
+
+        x_d[i*(MMQ_TILE_NE_K/2 + 1) + ksp] = d_pair; // row stride is MMQ_TILE_NE_K/2 + 1 half2 entries
+    }
+}
+
+template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_pq4_K( // fills x_tile with unpacked PQ4_K quants and per-sub-block scale pairs
+    const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
+    constexpr int nwarps = mmq_get_nwarps_device();
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_PQ4_K, mmq_y); // resolves to MMQ_DP4A_TXS_PQ2_K for this type
+    int   * x_qs = (int   *)  x_tile; // quant words start at the beginning of the tile
+    half2 * x_d  = (half2 *) (x_qs + txs.qs); // scale pairs are stored txs.qs ints after x_qs
+
+    constexpr int threads_per_row = 32; // each of these threads unpacks two 16-bit qs words of one row
+    constexpr int nrows = warp_size / threads_per_row; // rows covered by one warp per iteration
+    const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; // thread index within its 32-thread row group
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
+        int i = i0 + (nrows == 1 ? threadIdx.y : threadIdx.y*nrows + threadIdx.x/threads_per_row); // tile row handled by this thread
+
+        if (need_check) {
+            i = min(i, i_max); // clamp the row index to i_max
+        }
+
+        const block_pq4_K * bxi = (const block_pq4_K *) x + kbx0 + i*stride; // block for row i; rows are stride blocks apart
+        const uint16_t * q16 = (const uint16_t *) bxi->qs; // view qs as 16-bit words
+        const int2 vp = get_int_from_table_16((int) q16[txi] | ((int) q16[MMQ_TILE_NE_K + txi] << 16), PQK_DP4A_VAL_4BIT); // low 16 bits: word txi, high 16 bits: word MMQ_TILE_NE_K + txi
+        const int qs0 = __byte_perm(vp.x, vp.y, 0x5140); // selector 0x5140 -> bytes x0, y0, x1, y1
+        const int qs1 = __byte_perm(vp.x, vp.y, 0x7362); // selector 0x7362 -> bytes x2, y2, x3, y3
+
+        x_qs[i*(2*MMQ_TILE_NE_K + 1) + 0             + txi] = qs0; // row stride is 2*MMQ_TILE_NE_K + 1 ints
+        x_qs[i*(2*MMQ_TILE_NE_K + 1) + MMQ_TILE_NE_K + txi] = qs1;
+    }
+
+    constexpr int scale_pairs_per_row = GGML_PQ4_K_SUBBLOCK_COUNT / 2; // one scales[] byte carries two 4-bit local scales
+    constexpr int scale_rows_per_warp = warp_size / scale_pairs_per_row; // rows whose scales one warp fills per iteration
+    const int ksp = threadIdx.x % scale_pairs_per_row; // index of the scales[] byte handled by this thread
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*scale_rows_per_warp) {
+        int i = i0 + threadIdx.y*scale_rows_per_warp + threadIdx.x/scale_pairs_per_row;
+
+        if (need_check) {
+            i = min(i, i_max); // clamp the row index to i_max
+        }
+
+        const block_pq4_K * bxi = (const block_pq4_K *) x + kbx0 + i*stride;
+        const int sub0 = 2*ksp; // first sub-block of the pair stored in scales[ksp]
+        const int band = sub0 / GGML_PQ4_K_SUBBLOCKS_PER_BAND; // selects which master scale d[band] applies
+        const float dbase = __half2float(bxi->d[band]) * PQK_DP4A_INV_SCALE_4BIT; // master scale pre-multiplied by the 4-bit DP4A inverse scale
+        const uint8_t qscale_pair = bxi->scales[ksp];
+        const half2 d_pair = make_half2(
+            dbase * PQ4_K_LOCAL_SCALE_LUT[qscale_pair & 0x0Fu], // low nibble -> first half
+            dbase * PQ4_K_LOCAL_SCALE_LUT[qscale_pair >> 4]); // high nibble -> second half
+
+        x_d[i*(MMQ_TILE_NE_K/2 + 1) + ksp] = d_pair; // row stride is MMQ_TILE_NE_K/2 + 1 half2 entries
+    }
+}
+
 template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_mxfp4(
     const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
     constexpr int nwarps = mmq_get_nwarps_device();
@@ -3946,17 +4074,17 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_PQ2_K> {
 template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_PQ3_K> {
     static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
-    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_pq_K<GGML_TYPE_PQ3_K, mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
-    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_pq3_K<mmq_y, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_8_q8_1_mma<mmq_x, mmq_y>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_8_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
 template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_PQ4_K> {
     static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
-    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_pq_K<GGML_TYPE_PQ4_K, mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
-    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_pq4_K<mmq_y, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_8_q8_1_mma<mmq_x, mmq_y>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_8_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
 template <int mmq_x, int mmq_y, bool need_check>
diff --git a/ggml/src/ggml-cuda/pq-tq-common.cuh b/ggml/src/ggml-cuda/pq-tq-common.cuh
@@ -309,6 +309,20 @@ __constant__ static const float PQ2_K_LOCAL_SCALE_LUT[16] = {
     0.5520447568f, 0.6729500963f, 0.820335356f, 1.0f
 };
 
+__constant__ static const float PQ3_K_LOCAL_SCALE_LUT[16] = {
+    0.0f, 0.0625f, 0.07618835339f, 0.09287464307f,
+    0.113215458f, 0.1380111892f, 0.1682375241f, 0.205083839f,
+    0.25f, 0.3047534136f, 0.3714985723f, 0.4528618321f,
+    0.5520447568f, 0.6729500963f, 0.820335356f, 1.0f
+};
+
+__constant__ static const float PQ4_K_LOCAL_SCALE_LUT[16] = {
+    0.0f, 0.0625f, 0.07618835339f, 0.09287464307f,
+    0.113215458f, 0.1380111892f, 0.1682375241f, 0.205083839f,
+    0.25f, 0.3047534136f, 0.3714985723f, 0.4528618321f,
+    0.5520447568f, 0.6729500963f, 0.820335356f, 1.0f
+};
+
 #undef PQK_DP4A_VAL4_ENTRY
 #undef PQK_DP4A_VAL3_ENTRY
 #undef PQK_DP4A_VAL2_ENTRY
diff --git a/ggml/src/ggml-cuda/pq-tq-dequant-wht.cuh b/ggml/src/ggml-cuda/pq-tq-dequant-wht.cuh
@@ -77,6 +77,18 @@ static __device__ __forceinline__ float pq2_k_dequant_scale(const block_pq2_K *
     return ggml_pq2_k_decode_local_scale(master, ggml_pq2_k_scale_get(x[ib].scales, subblock));
 }
 
+static __device__ __forceinline__ float pq3_k_dequant_scale(const block_pq3_K * x, const int ib, const int subblock) { // dequantization scale for one sub-block of block x[ib]
+    const int band = subblock / GGML_PQ3_K_SUBBLOCKS_PER_BAND; // index of the master scale d[band] covering this sub-block
+    const float master = __half2float(x[ib].d[band]); // master scale converted from half to float
+    return ggml_pq3_k_decode_local_scale(master, ggml_pq3_k_scale_get(x[ib].scales, subblock)); // combine master scale with the sub-block's stored local scale code
+}
+
+static __device__ __forceinline__ float pq4_k_dequant_scale(const block_pq4_K * x, const int ib, const int subblock) { // dequantization scale for one sub-block of block x[ib]
+    const int band = subblock / GGML_PQ4_K_SUBBLOCKS_PER_BAND; // index of the master scale d[band] covering this sub-block
+    const float master = __half2float(x[ib].d[band]); // master scale converted from half to float
+    return ggml_pq4_k_decode_local_scale(master, ggml_pq4_k_scale_get(x[ib].scales, subblock)); // combine master scale with the sub-block's stored local scale code
+}
+
 static __device__ __forceinline__ float pq_dequant_elem_2_k(const void * vx, int64_t global_elem) {
     const block_pq2_K * x = (const block_pq2_K *) vx;
     const int ib = global_elem / QK_K;
@@ -91,8 +103,8 @@ static __device__ __forceinline__ float pq_dequant_elem_3_k(const void * vx, int
     const block_pq3_K * x = (const block_pq3_K *) vx;
     const int ib = global_elem / QK_K;
     const int il = global_elem % QK_K;
-    const int subblock = il / GGML_PQK_SUBBLOCK_SIZE;
-    const float scale = pqk_dequant_scale(x, ib, subblock);
+    const int subblock = il / GGML_PQ3_K_SUBBLOCK_SIZE;
+    const float scale = pq3_k_dequant_scale(x, ib, subblock);
     const uint8_t ql = (x[ib].qs[il / 4] >> (2 * (il & 3))) & 0x3u;
     const uint8_t qh = (x[ib].hmask[il / 8] >> (il & 7)) & 0x1u;
     return ggml_pqk_centroid_3bit((uint8_t)(ql | (qh << 2))) * scale;
@@ -102,7 +114,7 @@ static __device__ __forceinline__ float pq_dequant_elem_4_k(const void * vx, int
     const block_pq4_K * x = (const block_pq4_K *) vx;
     const int ib = global_elem / QK_K;
     const int il = global_elem % QK_K;
-    const float scale = pqk_dequant_scale(x, ib, il / GGML_PQK_SUBBLOCK_SIZE);
+    const float scale = pq4_k_dequant_scale(x, ib, il / GGML_PQ4_K_SUBBLOCK_SIZE);
     const uint8_t q = (x[ib].qs[il / 2] >> (4 * (il & 1))) & 0xFu;
     return ggml_pqk_centroid_4bit(q) * scale;
 }
@@ -306,8 +318,8 @@ __device__ __forceinline__ float2 pq_tq_dequant_pair<PqTqTypeTag::P3_K>(const vo
     const int ib = global_pair / pairs_per_block;
     const int ip = global_pair % pairs_per_block;
     const int il = 2 * ip;
-    const int subblock = il / GGML_PQK_SUBBLOCK_SIZE;
-    const float scale = pqk_dequant_scale(x, ib, subblock);
+    const int subblock = il / GGML_PQ3_K_SUBBLOCK_SIZE;
+    const float scale = pq3_k_dequant_scale(x, ib, subblock);
     const uint8_t qb = x[ib].qs[il / 4];
     const int shift = 2 * (il & 3);
     const uint8_t q0 = ((qb >> shift) & 0x3u) | (((x[ib].hmask[il / 8] >> (il & 7)) & 0x1u) << 2);
@@ -323,7 +335,7 @@ __device__ __forceinline__ float2 pq_tq_dequant_pair<PqTqTypeTag::P4_K>(const vo
     const int ib = global_pair / pairs_per_block;
     const int ip = global_pair % pairs_per_block;
     const int il = 2 * ip;
-    const float scale = pqk_dequant_scale(x, ib, il / GGML_PQK_SUBBLOCK_SIZE);
+    const float scale = pq4_k_dequant_scale(x, ib, il / GGML_PQ4_K_SUBBLOCK_SIZE);
     const uint8_t qb = x[ib].qs[ip];
     return make_float2(ggml_pqk_centroid_4bit(qb & 0xFu) * scale, ggml_pqk_centroid_4bit(qb >> 4) * scale);
 }
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -973,17 +973,17 @@ static __device__ __forceinline__ float vec_dot_pq3_K_q8_1(
     const block_pq3_K * bq = (const block_pq3_K *) vbq + kbx;
 
     const int sub = iqs;
-    const int q8_block = sub >> 1;
-    const int q8_i32 = (sub & 1) * 4;
-    const int band = sub / PQK_MMVQ_SUBBLOCKS_PER_BAND;
-    const int elem_base = sub * PQK_MMVQ_SUBBLOCK_SIZE;
+    const int q8_block = sub >> 2;
+    const int q8_i32 = (sub & 3) * 2;
+    const int band = sub / GGML_PQ3_K_SUBBLOCKS_PER_BAND;
+    const int elem_base = sub * GGML_PQ3_K_SUBBLOCK_SIZE;
 
     int sumi = 0;
 #pragma unroll
-    for (int i = 0; i < 4; ++i) {
+    for (int i = 0; i < 2; ++i) {
         const int elem = elem_base + 4*i;
         const uint8_t high = (bq->hmask[elem >> 3] >> (elem & 7)) & 0x0Fu;
-        const uint8_t qb = bq->qs[4*sub + i];
+        const uint8_t qb = bq->qs[2*sub + i];
         const int q4 = ((((qb >> 0) & 0x03u) | ((high & 0x01u) << 2)) <<  0)
                      | ((((qb >> 2) & 0x03u) | ((high & 0x02u) << 1)) <<  4)
                      | ((((qb >> 4) & 0x03u) |  (high & 0x04u))       <<  8)
@@ -994,8 +994,8 @@ static __device__ __forceinline__ float vec_dot_pq3_K_q8_1(
         sumi = ggml_cuda_dp4a(v, u, sumi);
     }
 
-    const uint8_t qscale = pqk_vec_scale_get(bq->scales, sub);
-    const float d = __half2float(bq->d[band]) * PQK_LOCAL_SCALE_LUT[qscale] * PQK_DP4A_INV_SCALE_3BIT;
+    const uint8_t qscale = ggml_pq3_k_scale_get(bq->scales, sub);
+    const float d = __half2float(bq->d[band]) * PQ3_K_LOCAL_SCALE_LUT[qscale] * PQK_DP4A_INV_SCALE_3BIT;
     return d * __low2float(bq8_1[q8_block].ds) * sumi;
 }
 
@@ -1005,22 +1005,22 @@ static __device__ __forceinline__ float vec_dot_pq4_K_q8_1(
     const block_pq4_K * bq = (const block_pq4_K *) vbq + kbx;
 
     const int sub = iqs;
-    const int q8_block = sub >> 1;
-    const int q8_i32 = (sub & 1) * 4;
-    const int band = sub / PQK_MMVQ_SUBBLOCKS_PER_BAND;
+    const int q8_block = sub >> 2;
+    const int q8_i32 = (sub & 3) * 2;
+    const int band = sub / GGML_PQ4_K_SUBBLOCKS_PER_BAND;
     const uint16_t * q16 = (const uint16_t *) bq->qs;
 
     int sumi = 0;
 #pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        const int2 vp = get_int_from_table_16((int) q16[4*sub + i], PQK_DP4A_VAL_4BIT);
+    for (int i = 0; i < 2; ++i) {
+        const int2 vp = get_int_from_table_16((int) q16[2*sub + i], PQK_DP4A_VAL_4BIT);
         const int v = __byte_perm(vp.x, vp.y, 0x5140);
         const int u = get_int_b4(bq8_1[q8_block].qs, q8_i32 + i);
         sumi = ggml_cuda_dp4a(v, u, sumi);
     }
 
-    const uint8_t qscale = pqk_vec_scale_get(bq->scales, sub);
-    const float d = __half2float(bq->d[band]) * PQK_LOCAL_SCALE_LUT[qscale] * PQK_DP4A_INV_SCALE_4BIT;
+    const uint8_t qscale = ggml_pq4_k_scale_get(bq->scales, sub);
+    const float d = __half2float(bq->d[band]) * PQ4_K_LOCAL_SCALE_LUT[qscale] * PQK_DP4A_INV_SCALE_4BIT;
     return d * __low2float(bq8_1[q8_block].ds) * sumi;
 }
 
diff --git a/ggml/src/ggml-pq-tq.c b/ggml/src/ggml-pq-tq.c
diff --git a/ggml/src/ggml-pqk-common.h b/ggml/src/ggml-pqk-common.h