Skip to content

Commit ba41d84

Browse files
committed
Add turbo4_0 (4-bit PolarQuant) + fix llama-bench segfault
- New GGML_TYPE_TURBO4_0: 4-bit Lloyd-Max centroids, block-32, group-128, WHT rotation
- 4-mag LUT optimization for pre-M5 Metal (halves constant addresses, +38% decode)
- Softmax exp() skip when s-M < -20, O rescaling skip when max unchanged
- Fix supports_op to only allow cross-type FA combos with kernel instantiations
- Add GGML_ASSERT for null pipelines in FA dispatch
- CPU quantize/dequantize + Metal dequant/FA/set_rows/get_rows for turbo4_0
- llama-bench: add turbo4_0 + all turbo/rq type name mappings
1 parent 50923d8 commit ba41d84

16 files changed

Lines changed: 1285 additions & 25 deletions

File tree

common/arg.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,9 @@ const std::vector<ggml_type> kv_cache_types = {
395395
GGML_TYPE_RQ4_1,
396396
GGML_TYPE_RQ5_1,
397397
GGML_TYPE_RQ6_1,
398+
GGML_TYPE_TURBO3_0,
399+
GGML_TYPE_TURBO2_0,
400+
GGML_TYPE_TURBO4_0,
398401
};
399402

400403
static ggml_type kv_cache_type_from_str(const std::string & s) {

ggml/include/ggml.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,10 @@ extern "C" {
436436
GGML_TYPE_RQ4_1 = 46, // RotorQuant 3-bit dk64: Clifford rotor + Lloyd-Max
437437
GGML_TYPE_RQ5_1 = 47, // RotorQuant 4-bit dk64: Clifford rotor + Lloyd-Max
438438
GGML_TYPE_RQ6_1 = 48, // RotorQuant 5-bit dk64: Clifford rotor + Lloyd-Max
439-
GGML_TYPE_COUNT = 49,
439+
GGML_TYPE_TURBO3_0 = 49, // TurboQuant 3-bit WHT (32-element blocks)
440+
GGML_TYPE_TURBO2_0 = 50, // TurboQuant 2-bit WHT (32-element blocks)
441+
GGML_TYPE_TURBO4_0 = 51, // TurboQuant 4-bit WHT pure PolarQuant (32-element blocks, 16 centroids)
442+
GGML_TYPE_COUNT = 52,
440443
};
441444

442445
// precision

ggml/src/ggml-common.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,33 @@ static_assert(sizeof(block_rq5_1) == sizeof(ggml_half) + 32, "wrong rq5_1 block
270270
typedef struct { ggml_half norm; uint8_t qs[40]; } block_rq6_1;
271271
static_assert(sizeof(block_rq6_1) == sizeof(ggml_half) + 40, "wrong rq6_1 block size");
272272

273+
// TurboQuant WHT-rotated types (block size 32, rotation group 128)
274+
#define QK_TURBO3_0 32
275+
#define QK_TURBO3_0_GROUP 128
276+
typedef struct { // storage for one QK_TURBO3_0 (32) element block of the 3-bit TurboQuant type
277+
ggml_half norm; // half-precision per-block norm (presumably the dequantization scale -- confirm against dequantize_row_turbo3_0)
278+
uint8_t qs[QK_TURBO3_0 / 4]; // 8 bytes: lower 2 bits per element, 4 elements per byte
279+
uint8_t signs[QK_TURBO3_0 / 8]; // 4 bytes: upper 1 bit per element, 8 elements per byte
280+
} block_turbo3_0; // 14 bytes
281+
static_assert(sizeof(block_turbo3_0) == sizeof(ggml_half) + QK_TURBO3_0/4 + QK_TURBO3_0/8, "wrong turbo3_0 block size"); // compile-time check: struct size equals the sum of its members (no padding)
282+
283+
#define QK_TURBO2_0 32
284+
#define QK_TURBO2_0_GROUP 128
285+
typedef struct { // storage for one QK_TURBO2_0 (32) element block of the 2-bit TurboQuant type
286+
ggml_half norm; // half-precision per-block norm (presumably the dequantization scale -- confirm against dequantize_row_turbo2_0)
287+
uint8_t qs[QK_TURBO2_0 / 4]; // 8 bytes: 2-bit indices, 4 elements per byte
288+
} block_turbo2_0; // 10 bytes
289+
static_assert(sizeof(block_turbo2_0) == sizeof(ggml_half) + QK_TURBO2_0/4, "wrong turbo2_0 block size"); // compile-time check: struct size equals the sum of its members (no padding)
290+
291+
// TurboQuant 4-bit WHT-rotated (pure PolarQuant, 16 centroids, no QJL)
292+
#define QK_TURBO4_0 32
293+
#define QK_TURBO4_0_GROUP 128
294+
typedef struct { // storage for one QK_TURBO4_0 (32) element block of the 4-bit TurboQuant type
295+
ggml_half norm; // half-precision per-block norm (presumably the dequantization scale -- confirm against dequantize_row_turbo4_0)
296+
uint8_t qs[QK_TURBO4_0 / 2]; // 16 bytes: 4-bit indices packed 2 per byte
297+
} block_turbo4_0; // 18 bytes
298+
static_assert(sizeof(block_turbo4_0) == sizeof(ggml_half) + QK_TURBO4_0/2, "wrong turbo4_0 block size"); // compile-time check: struct size equals the sum of its members (no padding)
299+
273300
//
274301
// Ternary quantization
275302
//

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,24 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
438438
.vec_dot_type = GGML_TYPE_Q8_0,
439439
.nrows = 1,
440440
},
441+
[GGML_TYPE_TURBO3_0] = {
442+
.from_float = quantize_row_turbo3_0,
443+
.vec_dot = ggml_vec_dot_turbo3_0_q8_0,
444+
.vec_dot_type = GGML_TYPE_Q8_0,
445+
.nrows = 1,
446+
},
447+
[GGML_TYPE_TURBO2_0] = {
448+
.from_float = quantize_row_turbo2_0,
449+
.vec_dot = ggml_vec_dot_turbo2_0_q8_0,
450+
.vec_dot_type = GGML_TYPE_Q8_0,
451+
.nrows = 1,
452+
},
453+
[GGML_TYPE_TURBO4_0] = {
454+
.from_float = quantize_row_turbo4_0,
455+
.vec_dot = ggml_vec_dot_turbo4_0_q8_0,
456+
.vec_dot_type = GGML_TYPE_Q8_0,
457+
.nrows = 1,
458+
},
441459
[GGML_TYPE_I32] = {
442460
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
443461
},

ggml/src/ggml-cpu/quants.c

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1435,3 +1435,74 @@ void ggml_vec_dot_rq6_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
14351435
}
14361436
*s = sumf;
14371437
}
1438+
1439+
// ============================ TurboQuant WHT wrappers (turbo3_0, turbo2_0, turbo4_0)
1440+
1441+
void quantize_row_turbo3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { // CPU quantize entry point for turbo3_0 (registered as .from_float in type_traits_cpu)
1442+
quantize_row_turbo3_0_ref(x, y, k); // thin wrapper: forwards x, y and k unchanged to quantize_row_turbo3_0_ref
1443+
}
1444+
1445+
void quantize_row_turbo2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { // CPU quantize entry point for turbo2_0 (registered as .from_float in type_traits_cpu)
1446+
quantize_row_turbo2_0_ref(x, y, k); // thin wrapper: forwards x, y and k unchanged to quantize_row_turbo2_0_ref
1447+
}
1448+
1449+
void ggml_vec_dot_turbo3_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product of n elements: vx holds turbo3_0 blocks, vy holds q8_0 blocks, result is written to *s
1450+
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); // stride arguments are ignored; nrc is marked unused because it is only read by the assert below
1451+
assert(nrc == 1); // only the single-row case is implemented
1452+
const int nb = n / QK_TURBO3_0; // number of whole blocks; any n % QK_TURBO3_0 trailing elements are not processed
1453+
float sumf = 0.0f; // running dot-product accumulator
1454+
float tmp_x[QK_TURBO3_0]; // scratch: one dequantized turbo3_0 block
1455+
float tmp_y[QK8_0]; // scratch: one dequantized q8_0 block
1456+
const block_turbo3_0 * GGML_RESTRICT x = (const block_turbo3_0 *)vx;
1457+
const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
1458+
for (int i = 0; i < nb; i++) { // x[i] and y[i] are walked in lockstep, one block of each per iteration
1459+
dequantize_row_turbo3_0(&x[i], tmp_x, QK_TURBO3_0); // expand block i of x to floats
1460+
dequantize_row_q8_0(&y[i], tmp_y, QK8_0); // expand block i of y to floats
1461+
for (int k = 0; k < QK8_0; k++) { // NOTE(review): bound is QK8_0 but tmp_x holds QK_TURBO3_0 floats -- relies on the two block sizes being equal; confirm
1462+
sumf += tmp_x[k] * tmp_y[k];
1463+
}
1464+
}
1465+
*s = sumf; // single scalar output
1466+
}
1467+
1468+
void ggml_vec_dot_turbo2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product of n elements: vx holds turbo2_0 blocks, vy holds q8_0 blocks, result is written to *s
1469+
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); // stride arguments are ignored; nrc is marked unused because it is only read by the assert below
1470+
assert(nrc == 1); // only the single-row case is implemented
1471+
const int nb = n / QK_TURBO2_0; // number of whole blocks; any n % QK_TURBO2_0 trailing elements are not processed
1472+
float sumf = 0.0f; // running dot-product accumulator
1473+
float tmp_x[QK_TURBO2_0]; // scratch: one dequantized turbo2_0 block
1474+
float tmp_y[QK8_0]; // scratch: one dequantized q8_0 block
1475+
const block_turbo2_0 * GGML_RESTRICT x = (const block_turbo2_0 *)vx;
1476+
const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 *)vy;
1477+
for (int i = 0; i < nb; i++) { // x[i] and y[i] are walked in lockstep, one block of each per iteration
1478+
dequantize_row_turbo2_0(&x[i], tmp_x, QK_TURBO2_0); // expand block i of x to floats
1479+
dequantize_row_q8_0(&y[i], tmp_y, QK8_0); // expand block i of y to floats
1480+
for (int k = 0; k < QK8_0; k++) { // NOTE(review): bound is QK8_0 but tmp_x holds QK_TURBO2_0 floats -- relies on the two block sizes being equal; confirm
1481+
sumf += tmp_x[k] * tmp_y[k];
1482+
}
1483+
}
1484+
*s = sumf; // single scalar output
1485+
}
1486+
1487+
void quantize_row_turbo4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { // CPU quantize entry point for turbo4_0 (registered as .from_float in type_traits_cpu)
1488+
quantize_row_turbo4_0_ref(x, y, k); // thin wrapper: forwards x, y and k unchanged to quantize_row_turbo4_0_ref
1489+
}
1490+
1491+
void ggml_vec_dot_turbo4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product of n elements: vx holds turbo4_0 blocks, vy holds q8_0 blocks, result is written to *s
1492+
GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); // stride arguments are ignored; nrc is marked unused because it is only read by the assert below
1493+
assert(nrc == 1); // only the single-row case is implemented
1494+
const int nb = n / QK_TURBO4_0; // number of whole blocks; any n % QK_TURBO4_0 trailing elements are not processed
1495+
float sumf = 0.0f; // running dot-product accumulator
1496+
float tmp_x[QK_TURBO4_0]; // scratch: one dequantized turbo4_0 block
1497+
float tmp_y[QK8_0]; // scratch: one dequantized q8_0 block
1498+
const block_turbo4_0 * GGML_RESTRICT x_b = (const block_turbo4_0 *)vx;
1499+
const block_q8_0 * GGML_RESTRICT y_b = (const block_q8_0 *)vy;
1500+
for (int i = 0; i < nb; i++) { // x_b[i] and y_b[i] are walked in lockstep, one block of each per iteration
1501+
dequantize_row_turbo4_0(&x_b[i], tmp_x, QK_TURBO4_0); // expand block i of x_b to floats
1502+
dequantize_row_q8_0(&y_b[i], tmp_y, QK8_0); // expand block i of y_b to floats
1503+
for (int k = 0; k < QK8_0; k++) { // NOTE(review): bound is QK8_0 but tmp_x holds QK_TURBO4_0 floats -- relies on the two block sizes being equal; confirm
1504+
sumf += tmp_x[k] * tmp_y[k];
1505+
}
1506+
}
1507+
*s = sumf; // single scalar output
1508+
}

ggml/src/ggml-cpu/quants.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ void quantize_row_turbo4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y
3737
void quantize_row_turbo5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
3838
void quantize_row_turbo6_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
3939

40+
void quantize_row_turbo3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
41+
void quantize_row_turbo2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
42+
void quantize_row_turbo4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
43+
4044
void quantize_row_rq3_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4145
void quantize_row_rq4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
4246
void quantize_row_rq5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -74,6 +78,10 @@ void ggml_vec_dot_rq4_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
7478
void ggml_vec_dot_rq5_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7579
void ggml_vec_dot_rq6_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7680

81+
void ggml_vec_dot_turbo3_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
82+
void ggml_vec_dot_turbo2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
83+
void ggml_vec_dot_turbo4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
84+
7785
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7886
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
7987
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,10 +1157,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
11571157
return false;
11581158
}
11591159
if (op->src[1]->type != op->src[2]->type) {
1160-
// Allow mixed turbo/rq types (auto-asymmetric K/V)
1161-
const bool k_is_turbo = (op->src[1]->type >= GGML_TYPE_TURBO3_1 && op->src[1]->type <= GGML_TYPE_RQ6_1);
1162-
const bool v_is_turbo = (op->src[2]->type >= GGML_TYPE_TURBO3_1 && op->src[2]->type <= GGML_TYPE_RQ6_1);
1163-
if (!(k_is_turbo && v_is_turbo)) {
1160+
// Only allow cross-type combos that have FA kernel instantiations
1161+
const enum ggml_type tk = op->src[1]->type;
1162+
const enum ggml_type tv = op->src[2]->type;
1163+
const bool valid_cross =
1164+
(tk == GGML_TYPE_TURBO5_1 && tv == GGML_TYPE_TURBO4_1) ||
1165+
(tk == GGML_TYPE_TURBO4_1 && tv == GGML_TYPE_TURBO3_1) ||
1166+
(tk == GGML_TYPE_RQ5_1 && tv == GGML_TYPE_RQ4_1) ||
1167+
(tk == GGML_TYPE_RQ4_1 && tv == GGML_TYPE_RQ3_1) ||
1168+
(tk == GGML_TYPE_TURBO3_0 && tv == GGML_TYPE_TURBO2_0) ||
1169+
(tk == GGML_TYPE_TURBO2_0 && tv == GGML_TYPE_TURBO3_0);
1170+
if (!valid_cross) {
11641171
return false;
11651172
}
11661173
}
@@ -1259,6 +1266,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
12591266
case GGML_TYPE_RQ4_1:
12601267
case GGML_TYPE_RQ5_1:
12611268
case GGML_TYPE_RQ6_1:
1269+
case GGML_TYPE_TURBO3_0:
1270+
case GGML_TYPE_TURBO2_0:
1271+
case GGML_TYPE_TURBO4_0:
12621272
return true;
12631273
default:
12641274
return false;

ggml/src/ggml-metal/ggml-metal-ops.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2838,6 +2838,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
28382838

28392839
auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
28402840

2841+
GGML_ASSERT(pipeline.pipeline && "FA non-vec pipeline is null - missing kernel instantiation for this K/V type combo");
28412842
ggml_metal_encoder_set_pipeline(enc, pipeline);
28422843
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
28432844
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
@@ -2976,6 +2977,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
29762977

29772978
auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
29782979

2980+
GGML_ASSERT(pipeline.pipeline && "FA vec pipeline is null - missing kernel instantiation for this K/V type combo");
29792981
GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
29802982

29812983
ggml_metal_encoder_set_pipeline(enc, pipeline);

0 commit comments

Comments
 (0)