Skip to content

Commit f5dda72

Browse files
Merge pull request #8 from PrismML-Eng/cpu-fixes
some cpu fixes; getting ready for upstream PR; e.g. id 40 is taken by…
2 parents 1179bfc + 7cfedd0 commit f5dda72

6 files changed

Lines changed: 82 additions & 196 deletions

File tree

ggml/include/ggml.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,9 +427,10 @@ extern "C" {
427427
// GGML_TYPE_IQ4_NL_4_8 = 37,
428428
// GGML_TYPE_IQ4_NL_8_8 = 38,
429429
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
430-
GGML_TYPE_Q1_0 = 40,
430+
// 40 is GGML_TYPE_NVFP4 in upstream llama.cpp
431431
GGML_TYPE_Q1_0_g128 = 41,
432-
GGML_TYPE_COUNT = 42,
432+
GGML_TYPE_Q1_0 = 42,
433+
GGML_TYPE_COUNT = 43,
433434
};
434435

435436
// precision
@@ -465,8 +466,8 @@ extern "C" {
465466
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
466467
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
467468
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
468-
GGML_FTYPE_MOSTLY_Q1_0 = 26, // except 1d tensors
469469
GGML_FTYPE_MOSTLY_Q1_0_g128 = 27, // except 1d tensors
470+
GGML_FTYPE_MOSTLY_Q1_0 = 28, // except 1d tensors
470471
};
471472

472473
// available tensor operations:

ggml/src/ggml-cpu/arch/arm/quants.c

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -137,30 +137,70 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
137137

138138
//===================================== Dot products =================================
139139

140-
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
141-
// For nrc > 1, call generic multiple times
142-
if (nrc == 1) {
143-
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
144-
} else {
145-
// Handle multiple rows by calling generic for each
146-
const int qk = QK8_0;
147-
const int nb = n / qk;
148-
const size_t x_size = nb * sizeof(block_q1_0);
149-
const size_t y_size = nb * sizeof(block_q8_0);
150-
151-
for (int i = 0; i < nrc; i++) {
152-
ggml_vec_dot_q1_0_q8_0_generic(
153-
n,
154-
s + i,
155-
bs,
156-
(const char *)vx + i * x_size,
157-
bx,
158-
(const char *)vy + i * y_size,
159-
by,
160-
1
161-
);
162-
}
140+
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
141+
const int qk = QK1_0;
142+
const int nb = n / qk;
143+
144+
assert(n % qk == 0);
145+
assert(nrc == 1);
146+
UNUSED(nrc);
147+
UNUSED(bx);
148+
UNUSED(by);
149+
UNUSED(bs);
150+
151+
const block_q1_0 * GGML_RESTRICT x = vx;
152+
const block_q8_0 * GGML_RESTRICT y = vy;
153+
154+
float sumf = 0.0f;
155+
156+
#if defined(__ARM_NEON)
157+
float32x4_t sumv = vdupq_n_f32(0.0f);
158+
159+
for (int i = 0; i < nb; i++) {
160+
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
161+
const float d1 = GGML_CPU_FP16_TO_FP32(y[i].d);
162+
163+
const uint8_t * bits = x[i].qs;
164+
165+
const int8x16_t y0 = vld1q_s8(y[i].qs);
166+
const int8x16_t y1 = vld1q_s8(y[i].qs + 16);
167+
168+
const uint64_t expand0 = table_b2b_0[bits[0]];
169+
const uint64_t expand1 = table_b2b_0[bits[1]];
170+
const uint64_t expand2 = table_b2b_0[bits[2]];
171+
const uint64_t expand3 = table_b2b_0[bits[3]];
172+
173+
uint8x8_t e0 = vcreate_u8(expand0);
174+
uint8x8_t e1 = vcreate_u8(expand1);
175+
uint8x8_t e2 = vcreate_u8(expand2);
176+
uint8x8_t e3 = vcreate_u8(expand3);
177+
178+
int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
179+
int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
180+
int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
181+
int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
182+
183+
int8x8_t one = vdup_n_s8(1);
184+
s0 = vsub_s8(vadd_s8(s0, s0), one);
185+
s1 = vsub_s8(vadd_s8(s1, s1), one);
186+
s2 = vsub_s8(vadd_s8(s2, s2), one);
187+
s3 = vsub_s8(vadd_s8(s3, s3), one);
188+
189+
int8x16_t signs0 = vcombine_s8(s0, s1);
190+
int8x16_t signs1 = vcombine_s8(s2, s3);
191+
192+
int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
193+
int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
194+
195+
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
163196
}
197+
198+
sumf = vaddvq_f32(sumv);
199+
#else
200+
ggml_vec_dot_q1_0_q8_0_generic(n, &sumf, bs, vx, bx, vy, by, 1);
201+
#endif
202+
203+
*s = sumf;
164204
}
165205

166206
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {

ggml/src/ggml-cpu/arch/x86/quants.c

Lines changed: 2 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -541,162 +541,11 @@ static inline __m128i get_scale_shuffle(int i) {
541541
#endif
542542

543543
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
544-
const int qk = QK8_0;
545-
const int nb = n / qk;
546-
547-
assert(n % qk == 0);
548-
assert(nrc == 1);
549-
UNUSED(nrc);
550-
UNUSED(bx);
551-
UNUSED(by);
552-
UNUSED(bs);
553-
554-
const block_q1_0 * GGML_RESTRICT x = vx;
555-
const block_q8_0 * GGML_RESTRICT y = vy;
556-
557-
int ib = 0;
558-
float sumf = 0;
559-
560-
#if defined(__AVX2__)
561-
// Initialize accumulator with zeros
562-
__m256 acc = _mm256_setzero_ps();
563-
564-
// Main loop - compute dot product for each block
565-
for (; ib < nb; ++ib) {
566-
// Compute combined scale for the block
567-
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
568-
569-
// Load Q1_0 bits (4 bytes = 32 bits)
570-
const uint32_t qbits32 = *(const uint32_t *)x[ib].qs;
571-
572-
// Load Q8_0 values (32 bytes)
573-
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
574-
575-
// Expand 32 bits to 32 bytes (each bit becomes ±1)
576-
// We need to place the right byte in each 8-byte group and mask the right bit
577-
__m256i qx;
578-
{
579-
// Create a vector with each of the 4 bytes replicated 8 times
580-
// Byte 0 in positions 0-7, byte 1 in positions 8-15, byte 2 in positions 16-23, byte 3 in positions 24-31
581-
const __m256i shuffle_mask = _mm256_set_epi8(
582-
3, 3, 3, 3, 3, 3, 3, 3, // byte 3 (bits 24-31) replicated
583-
2, 2, 2, 2, 2, 2, 2, 2, // byte 2 (bits 16-23) replicated
584-
1, 1, 1, 1, 1, 1, 1, 1, // byte 1 (bits 8-15) replicated
585-
0, 0, 0, 0, 0, 0, 0, 0 // byte 0 (bits 0-7) replicated
586-
);
587-
588-
// Broadcast the 4 bytes across the 128-bit lanes
589-
const __m128i qbits_128 = _mm_set1_epi32(qbits32);
590-
const __m256i qbits_256 = _mm256_broadcastsi128_si256(qbits_128);
591-
592-
// Shuffle to replicate bytes
593-
const __m256i qbits_shuffled = _mm256_shuffle_epi8(qbits_256, shuffle_mask);
594-
595-
// Create bit masks for each position within a byte
596-
const __m256i bit_mask = _mm256_set_epi8(
597-
(char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, // masks for byte 3
598-
(char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, // masks for byte 2
599-
(char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01, // masks for byte 1
600-
(char)0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 // masks for byte 0
601-
);
602-
603-
// Test each bit: AND with mask, compare to mask
604-
// Result is 0xFF if bit is set, 0x00 if not
605-
const __m256i bit_test = _mm256_and_si256(qbits_shuffled, bit_mask);
606-
const __m256i is_set = _mm256_cmpeq_epi8(bit_test, bit_mask);
607-
608-
// Convert 0xFF -> +1, 0x00 -> -1
609-
// is_set is 0xFF (all bits set) if bit is 1, or 0x00 if bit is 0
610-
// We want: +1 if bit is 1, -1 if bit is 0
611-
// Method: (is_set & 1) gives 1 or 0, then (value << 1) - 1 gives +1 or -1
612-
const __m256i ones = _mm256_set1_epi8(1);
613-
const __m256i bit_value = _mm256_and_si256(is_set, ones); // 0x01 or 0x00
614-
const __m256i bit_doubled = _mm256_add_epi8(bit_value, bit_value); // 0x02 or 0x00
615-
qx = _mm256_sub_epi8(bit_doubled, ones); // 0x01 or 0xFF (-1)
616-
}
617-
618-
// Multiply and accumulate using the same pattern as Q4_0
619-
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
620-
621-
// Multiply q with scale and accumulate
622-
acc = _mm256_fmadd_ps(d, q, acc);
623-
}
624-
625-
sumf = hsum_float_8(acc);
626-
627-
#endif
628-
// Fallback scalar loop for remaining blocks
629-
for (; ib < nb; ++ib) {
630-
const uint8_t* qbits = x[ib].qs;
631-
int sumi = 0;
632-
633-
// Optimized scalar processing for QK1_0 bits
634-
for (int byte_idx = 0; byte_idx < QK1_0/8; ++byte_idx) {
635-
const uint8_t bits8 = qbits[byte_idx];
636-
const int base_idx = byte_idx * 8;
637-
638-
// Process each bit
639-
for (int bit_idx = 0; bit_idx < 8; ++bit_idx) {
640-
const int xi = (bits8 & (1U << bit_idx)) ? 1 : -1;
641-
sumi += xi * y[ib].qs[base_idx + bit_idx];
642-
}
643-
}
644-
645-
sumf += sumi * GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
646-
}
647-
648-
*s = sumf;
544+
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
649545
}
650546

651547
void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
652-
const int qk = QK1_0_g128;
653-
const int nb = n / qk;
654-
655-
assert(n % qk == 0);
656-
assert(nrc == 1);
657-
UNUSED(nrc);
658-
UNUSED(bx);
659-
UNUSED(by);
660-
UNUSED(bs);
661-
662-
const block_q1_0_g128 * GGML_RESTRICT x = vx;
663-
const block_q8_0 * GGML_RESTRICT y = vy;
664-
665-
float sumf = 0;
666-
667-
// Each Q1_0_g128 block has 128 elements
668-
// Each Q8_0 block has 32 elements
669-
// So we need 4 Q8_0 blocks per Q1_0_g128 block
670-
for (int ib = 0; ib < nb; ++ib) {
671-
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
672-
673-
int sumi = 0;
674-
675-
// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
676-
for (int k = 0; k < 4; k++) {
677-
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib*4 + k].d);
678-
679-
int sumi_block = 0;
680-
681-
for (int j = 0; j < QK8_0; j++) {
682-
const int bit_index = k * QK8_0 + j;
683-
const int byte_index = bit_index / 8;
684-
const int bit_offset = bit_index % 8;
685-
686-
// Extract bit: 1 = +1, 0 = -1
687-
const int xi = ((x[ib].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
688-
const int yi = y[ib*4 + k].qs[j];
689-
690-
sumi_block += xi * yi;
691-
}
692-
693-
sumi += d1 * sumi_block;
694-
}
695-
696-
sumf += d0 * sumi;
697-
}
698-
699-
*s = sumf;
548+
ggml_vec_dot_q1_0_g128_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
700549
}
701550

702551
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {

ggml/src/ggml-cpu/quants.c

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -183,29 +183,25 @@ void ggml_vec_dot_q1_0_g128_q8_0_generic(int n, float * GGML_RESTRICT s, size_t
183183
for (int i = 0; i < nb; i++) {
184184
const float d0 = GGML_FP16_TO_FP32(x[i].d);
185185

186-
int sumi = 0;
187-
188-
// Process 4 Q8_0 blocks (4 * 32 = 128 elements)
186+
float sumi = 0.0f;
187+
189188
for (int k = 0; k < 4; k++) {
190189
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
191-
190+
192191
int sumi_block = 0;
193-
192+
194193
for (int j = 0; j < QK8_0; j++) {
195194
const int bit_index = k * QK8_0 + j;
196195
const int byte_index = bit_index / 8;
197196
const int bit_offset = bit_index % 8;
198-
199-
// Extract bit: 1 = +1, 0 = -1
197+
200198
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
201-
const int yi = y[i*4 + k].qs[j];
202-
203-
sumi_block += xi * yi;
199+
sumi_block += xi * y[i*4 + k].qs[j];
204200
}
205-
201+
206202
sumi += d1 * sumi_block;
207203
}
208-
204+
209205
sumf += d0 * sumi;
210206
}
211207

gguf-py/gguf/constants.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3773,8 +3773,8 @@ class GGMLQuantizationType(IntEnum):
37733773
TQ1_0 = 34
37743774
TQ2_0 = 35
37753775
MXFP4 = 39
3776-
Q1_0 = 40
37773776
Q1_0_g128 = 41
3777+
Q1_0 = 42
37783778

37793779

37803780
class ExpertGatingFuncType(IntEnum):
@@ -3827,8 +3827,8 @@ class LlamaFileType(IntEnum):
38273827
MOSTLY_TQ1_0 = 36 # except 1d tensors
38283828
MOSTLY_TQ2_0 = 37 # except 1d tensors
38293829
MOSTLY_MXFP4_MOE = 38 # except 1d tensors
3830-
MOSTLY_Q1_0 = 40 # except 1d tensors
3831-
MOSTLY_Q1_0_g128 = 41 # except 1d tensors
3830+
MOSTLY_Q1_0_g128 = 40 # except 1d tensors
3831+
MOSTLY_Q1_0 = 41 # except 1d tensors
38323832

38333833
GUESSED = 1024 # not specified in the model file
38343834

include/llama.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,8 @@ extern "C" {
152152
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
153153
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
154154
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
155-
LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors
156-
LLAMA_FTYPE_MOSTLY_Q1_0_g128 = 41, // except 1d tensors
155+
LLAMA_FTYPE_MOSTLY_Q1_0_g128 = 40, // except 1d tensors
156+
LLAMA_FTYPE_MOSTLY_Q1_0 = 41, // except 1d tensors
157157

158158
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
159159
};

0 commit comments

Comments
 (0)