Skip to content

Commit 2e1f0a8

Browse files
khosravipashaCISC
andauthored
ggml: add Q1_0 1-bit quantization support (CPU) (ggml-org#21273)
* ggml: add Q1_0 and Q1_0_g128 1-bit quantization support (CPU) * add generic fallback for x86 * remove Q1_0 (group size 32) * rename Q1_0_g128 => Q1_0 * fix Q1_0 LlamaFileType Enum * Fix trailing spaces; add generic fallback for othre backends * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * fix /r/n spacing + arch-fallback --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent 506200c commit 2e1f0a8

File tree

21 files changed

+285
-5
lines changed

21 files changed

+285
-5
lines changed

ggml/include/ggml.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,8 @@ extern "C" {
428428
// GGML_TYPE_IQ4_NL_8_8 = 38,
429429
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
430430
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
431-
GGML_TYPE_COUNT = 41,
431+
GGML_TYPE_Q1_0 = 41,
432+
GGML_TYPE_COUNT = 42,
432433
};
433434

434435
// precision
@@ -465,6 +466,7 @@ extern "C" {
465466
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
466467
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
467468
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
469+
GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors
468470
};
469471

470472
// available tensor operations:

ggml/src/ggml-common.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ typedef sycl::half2 ggml_half2;
9393
// QR = QK / number of values before dequantization
9494
// QI = number of 32 bit integers before dequantization
9595

96+
#define QI1_0 (QK1_0 / 32)
97+
#define QR1_0 1
98+
99+
96100
#define QI4_0 (QK4_0 / (4 * QR4_0))
97101
#define QR4_0 2
98102

@@ -170,6 +174,13 @@ typedef sycl::half2 ggml_half2;
170174
#define GGML_EXTENSION __extension__
171175
#endif // _MSC_VER
172176

177+
#define QK1_0 128
178+
typedef struct {
179+
ggml_half d; // delta
180+
uint8_t qs[QK1_0 / 8]; // bits / quants
181+
} block_q1_0;
182+
static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0 / 8, "wrong q1_0 block size/padding");
183+
173184
#define QK4_0 32
174185
typedef struct {
175186
ggml_half d; // delta

ggml/src/ggml-cpu/arch-fallback.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
1717
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
1818
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
19+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
1920
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
2021
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
2122
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -82,6 +83,7 @@
8283
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
8384
// quants.c
8485
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
86+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
8587
// repack.cpp
8688
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
8789
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -112,6 +114,7 @@
112114
// quants.c
113115
#define quantize_row_q8_K_generic quantize_row_q8_K
114116
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
117+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
115118
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
116119
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
117120
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
@@ -160,6 +163,7 @@
160163
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
161164
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
162165
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
166+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
163167
// repack.cpp
164168
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
165169
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -200,6 +204,7 @@
200204
#elif defined(__riscv)
201205
// quants.c
202206
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
207+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
203208
// repack.cpp
204209
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
205210
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -240,6 +245,7 @@
240245
// quants.c
241246
#define quantize_row_q8_K_generic quantize_row_q8_K
242247
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
248+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
243249
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
244250
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
245251
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -303,6 +309,7 @@
303309
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
304310
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
305311
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
312+
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
306313
// repack.cpp
307314
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
308315
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8

ggml/src/ggml-cpu/arch/arm/quants.c

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,109 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
137137

138138
//===================================== Dot products =================================
139139

140+
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
141+
const int qk = QK1_0; // 128
142+
const int nb = n / qk;
143+
144+
assert(n % qk == 0);
145+
assert(nrc == 1);
146+
UNUSED(nrc);
147+
UNUSED(bx);
148+
UNUSED(by);
149+
UNUSED(bs);
150+
151+
const block_q1_0 * GGML_RESTRICT x = vx;
152+
const block_q8_0 * GGML_RESTRICT y = vy;
153+
154+
float sumf = 0.0f;
155+
156+
#if defined(__ARM_NEON)
157+
float32x4_t sumv = vdupq_n_f32(0.0f);
158+
159+
for (int i = 0; i < nb; i++) {
160+
const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d);
161+
162+
// Process 4 Q8_0 blocks (each has 32 elements)
163+
for (int k = 0; k < 4; k++) {
164+
const block_q8_0 * GGML_RESTRICT yb = &y[i * 4 + k];
165+
const float d1 = GGML_CPU_FP16_TO_FP32(yb->d);
166+
167+
// Get the 4 bytes of bits for this Q8_0 block (32 bits = 4 bytes)
168+
// Bits are at offset k*4 bytes in x[i].qs
169+
const uint8_t * bits = &x[i].qs[k * 4];
170+
171+
// Load 32 int8 values from y
172+
const int8x16_t y0 = vld1q_s8(yb->qs);
173+
const int8x16_t y1 = vld1q_s8(yb->qs + 16);
174+
175+
// Byte 0-1: bits for y0[0..15]
176+
const uint64_t expand0 = table_b2b_0[bits[0]];
177+
const uint64_t expand1 = table_b2b_0[bits[1]];
178+
// Byte 2-3: bits for y1[0..15]
179+
const uint64_t expand2 = table_b2b_0[bits[2]];
180+
const uint64_t expand3 = table_b2b_0[bits[3]];
181+
182+
// Build the sign vectors by reinterpreting the table values
183+
uint8x8_t e0 = vcreate_u8(expand0);
184+
uint8x8_t e1 = vcreate_u8(expand1);
185+
uint8x8_t e2 = vcreate_u8(expand2);
186+
uint8x8_t e3 = vcreate_u8(expand3);
187+
188+
// Shift right by 4 to get 0 or 1
189+
int8x8_t s0 = vreinterpret_s8_u8(vshr_n_u8(e0, 4));
190+
int8x8_t s1 = vreinterpret_s8_u8(vshr_n_u8(e1, 4));
191+
int8x8_t s2 = vreinterpret_s8_u8(vshr_n_u8(e2, 4));
192+
int8x8_t s3 = vreinterpret_s8_u8(vshr_n_u8(e3, 4));
193+
194+
// Convert 0/1 to -1/+1: sign = 2*val - 1
195+
int8x8_t one = vdup_n_s8(1);
196+
s0 = vsub_s8(vadd_s8(s0, s0), one); // 2*s0 - 1
197+
s1 = vsub_s8(vadd_s8(s1, s1), one);
198+
s2 = vsub_s8(vadd_s8(s2, s2), one);
199+
s3 = vsub_s8(vadd_s8(s3, s3), one);
200+
201+
// Combine into 16-element vectors
202+
int8x16_t signs0 = vcombine_s8(s0, s1);
203+
int8x16_t signs1 = vcombine_s8(s2, s3);
204+
205+
// Multiply signs with y values and accumulate
206+
// dot(signs, y) where signs are +1/-1
207+
int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), signs0, y0);
208+
int32x4_t p1 = ggml_vdotq_s32(p0, signs1, y1);
209+
210+
// Scale by d1 and accumulate
211+
sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1);
212+
}
213+
}
214+
215+
sumf = vaddvq_f32(sumv);
216+
#else
217+
// Scalar fallback
218+
for (int i = 0; i < nb; i++) {
219+
const float d0 = GGML_FP16_TO_FP32(x[i].d);
220+
221+
// Process 4 Q8_0 blocks
222+
for (int k = 0; k < 4; k++) {
223+
const float d1 = GGML_FP16_TO_FP32(y[i*4 + k].d);
224+
225+
int sumi = 0;
226+
for (int j = 0; j < QK8_0; j++) {
227+
const int bit_index = k * QK8_0 + j;
228+
const int byte_index = bit_index / 8;
229+
const int bit_offset = bit_index % 8;
230+
231+
const int xi = ((x[i].qs[byte_index] >> bit_offset) & 1) ? 1 : -1;
232+
sumi += xi * y[i*4 + k].qs[j];
233+
}
234+
sumf += d0 * d1 * sumi;
235+
}
236+
}
237+
#endif
238+
239+
*s = sumf;
240+
}
241+
242+
140243
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
141244
const int qk = QK8_0;
142245
const int nb = n / qk;

ggml/src/ggml-cpu/arch/loongarch/quants.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2156,4 +2156,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
21562156
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
21572157
#endif
21582158
}
2159-

ggml/src/ggml-cpu/arch/powerpc/quants.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2302,4 +2302,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
23022302
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
23032303
#endif
23042304
}
2305-

ggml/src/ggml-cpu/arch/s390/quants.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1463,4 +1463,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
14631463
ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
14641464
#endif
14651465
}
1466-

ggml/src/ggml-cpu/arch/wasm/quants.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1218,4 +1218,3 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
12181218
ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
12191219
#endif
12201220
}
1221-

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
217217
.vec_dot_type = GGML_TYPE_F16,
218218
.nrows = 1,
219219
},
220+
[GGML_TYPE_Q1_0] = {
221+
.from_float = quantize_row_q1_0,
222+
.vec_dot = ggml_vec_dot_q1_0_q8_0,
223+
.vec_dot_type = GGML_TYPE_Q8_0,
224+
.nrows = 1,
225+
},
220226
[GGML_TYPE_Q4_0] = {
221227
.from_float = quantize_row_q4_0,
222228
.vec_dot = ggml_vec_dot_q4_0_q8_0,

ggml/src/ggml-cpu/ops.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4829,6 +4829,7 @@ void ggml_compute_forward_get_rows(
48294829
const ggml_tensor * src0 = dst->src[0];
48304830

48314831
switch (src0->type) {
4832+
case GGML_TYPE_Q1_0:
48324833
case GGML_TYPE_Q4_0:
48334834
case GGML_TYPE_Q4_1:
48344835
case GGML_TYPE_Q5_0:
@@ -5554,6 +5555,7 @@ void ggml_compute_forward_clamp(
55545555
ggml_compute_forward_clamp_f16(params, dst);
55555556
} break;
55565557
case GGML_TYPE_BF16:
5558+
case GGML_TYPE_Q1_0:
55575559
case GGML_TYPE_Q4_0:
55585560
case GGML_TYPE_Q4_1:
55595561
case GGML_TYPE_Q5_0:

0 commit comments

Comments
 (0)