luvwinnie
diff --git a/‎common/arg.cpp‎
Lines changed: 3 additions & 0 deletions b/‎common/arg.cpp‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎ggml/include/ggml.h‎
Lines changed: 4 additions & 1 deletion b/‎ggml/include/ggml.h‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎ggml/src/ggml-common.h‎
Lines changed: 27 additions & 0 deletions b/‎ggml/src/ggml-common.h‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎ggml/src/ggml-cpu/ggml-cpu.c‎
Lines changed: 18 additions & 0 deletions b/‎ggml/src/ggml-cpu/ggml-cpu.c‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎ggml/src/ggml-cpu/quants.c‎
Lines changed: 71 additions & 0 deletions b/‎ggml/src/ggml-cpu/quants.c‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎ggml/src/ggml-cpu/quants.h‎
Lines changed: 8 additions & 0 deletions b/‎ggml/src/ggml-cpu/quants.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎ggml/src/ggml-metal/ggml-metal-device.m‎
Lines changed: 14 additions & 4 deletions b/‎ggml/src/ggml-metal/ggml-metal-device.m‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎ggml/src/ggml-metal/ggml-metal-ops.cpp‎
Lines changed: 2 additions & 0 deletions b/‎ggml/src/ggml-metal/ggml-metal-ops.cpp‎
Lines changed: 2 additions & 0 deletions
@@ -395,6 +395,9 @@ const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_RQ4_1,
     GGML_TYPE_RQ5_1,
     GGML_TYPE_RQ6_1,
+    GGML_TYPE_TURBO3_0,
+    GGML_TYPE_TURBO2_0,
+    GGML_TYPE_TURBO4_0,
 };
 
 static ggml_type kv_cache_type_from_str(const std::string & s) {
 
@@ -436,7 +436,10 @@ extern "C" {
         GGML_TYPE_RQ4_1    = 46, // RotorQuant 3-bit dk64: Clifford rotor + Lloyd-Max
         GGML_TYPE_RQ5_1    = 47, // RotorQuant 4-bit dk64: Clifford rotor + Lloyd-Max
         GGML_TYPE_RQ6_1    = 48, // RotorQuant 5-bit dk64: Clifford rotor + Lloyd-Max
-        GGML_TYPE_COUNT    = 49,
+        GGML_TYPE_TURBO3_0 = 49, // TurboQuant 3-bit WHT (32-element blocks)
+        GGML_TYPE_TURBO2_0 = 50, // TurboQuant 2-bit WHT (32-element blocks)
+        GGML_TYPE_TURBO4_0 = 51, // TurboQuant 4-bit WHT pure PolarQuant (32-element blocks, 16 centroids)
+        GGML_TYPE_COUNT    = 52,
     };
 
     // precision
 
@@ -270,6 +270,33 @@ static_assert(sizeof(block_rq5_1) == sizeof(ggml_half) + 32, "wrong rq5_1 block
 typedef struct { ggml_half norm; uint8_t qs[40]; } block_rq6_1;
 static_assert(sizeof(block_rq6_1) == sizeof(ggml_half) + 40, "wrong rq6_1 block size");
 
+// TurboQuant WHT-rotated types (block size 32, rotation group 128)
+#define QK_TURBO3_0 32 // elements per turbo3_0 block
+#define QK_TURBO3_0_GROUP 128 // rotation group length: 128 / 32 = 4 consecutive blocks
+typedef struct {
+    ggml_half norm; // NOTE(review): presumably a per-block scale/norm - confirm against quantize_row_turbo3_0_ref
+    uint8_t qs[QK_TURBO3_0 / 4];     // 8 bytes: lower 2-bit (32 elements x 2 bits, i.e. 4 elements per byte)
+    uint8_t signs[QK_TURBO3_0 / 8];  // 4 bytes: upper 1-bit (32 elements x 1 bit, i.e. 8 elements per byte)
+} block_turbo3_0;  // 14 bytes; 2 + 1 stored bits per element give the 3-bit code
+static_assert(sizeof(block_turbo3_0) == sizeof(ggml_half) + QK_TURBO3_0/4 + QK_TURBO3_0/8, "wrong turbo3_0 block size"); // build fails if the compiler pads the struct beyond the sum of its fields
+
+#define QK_TURBO2_0 32 // elements per turbo2_0 block
+#define QK_TURBO2_0_GROUP 128 // rotation group length: 128 / 32 = 4 consecutive blocks
+typedef struct {
+    ggml_half norm; // NOTE(review): presumably a per-block scale/norm - confirm against quantize_row_turbo2_0_ref
+    uint8_t qs[QK_TURBO2_0 / 4];  // 8 bytes: 2-bit indices (32 elements x 2 bits, i.e. 4 elements per byte)
+} block_turbo2_0;  // 10 bytes
+static_assert(sizeof(block_turbo2_0) == sizeof(ggml_half) + QK_TURBO2_0/4, "wrong turbo2_0 block size"); // build fails if the compiler pads the struct beyond the sum of its fields
+
+// TurboQuant 4-bit WHT-rotated (pure PolarQuant, 16 centroids, no QJL)
+#define QK_TURBO4_0 32 // elements per turbo4_0 block
+#define QK_TURBO4_0_GROUP 128 // rotation group length: 128 / 32 = 4 consecutive blocks
+typedef struct {
+    ggml_half norm; // NOTE(review): presumably a per-block scale/norm - confirm against quantize_row_turbo4_0_ref
+    uint8_t qs[QK_TURBO4_0 / 2];  // 16 bytes: 4-bit indices packed 2 per byte
+} block_turbo4_0;  // 18 bytes
+static_assert(sizeof(block_turbo4_0) == sizeof(ggml_half) + QK_TURBO4_0/2, "wrong turbo4_0 block size"); // build fails if the compiler pads the struct beyond the sum of its fields
+
 //
 // Ternary quantization
 //
 
@@ -438,6 +438,24 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
         .nrows                    = 1,
     },
+    [GGML_TYPE_TURBO3_0] = {
+        .from_float               = quantize_row_turbo3_0,
+        .vec_dot                  = ggml_vec_dot_turbo3_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TURBO2_0] = {
+        .from_float               = quantize_row_turbo2_0,
+        .vec_dot                  = ggml_vec_dot_turbo2_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
+    [GGML_TYPE_TURBO4_0] = {
+        .from_float               = quantize_row_turbo4_0,
+        .vec_dot                  = ggml_vec_dot_turbo4_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_I32] = {
         .from_float               = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
     },
 
@@ -1435,3 +1435,74 @@ void ggml_vec_dot_rq6_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
     }
     *s = sumf;
 }
+
+// ============================ TurboQuant WHT wrappers (turbo3_0, turbo2_0, turbo4_0)
+
+void quantize_row_turbo3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { // CPU-backend quantizer for turbo3_0: x is the float input, y the output buffer, k is passed through (presumably the element count)
+    quantize_row_turbo3_0_ref(x, y, k); // thin wrapper: forwards all arguments unchanged to the reference implementation
+}
+
+void quantize_row_turbo2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { // CPU-backend quantizer for turbo2_0: x is the float input, y the output buffer, k is passed through (presumably the element count)
+    quantize_row_turbo2_0_ref(x, y, k); // thin wrapper: forwards all arguments unchanged to the reference implementation
+}
+
+void ggml_vec_dot_turbo3_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product of n turbo3_0 values (vx) with n q8_0 values (vy), computed in float and written to *s
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); // the three strides are never read; nrc is only read by the assert below
+    assert(nrc == 1); // only the single-result form is implemented
+    const int nb = n / QK_TURBO3_0; // whole blocks only: a tail of n % QK_TURBO3_0 elements would be silently skipped
+    float sumf = 0.0f;
+    float tmp_x[QK_TURBO3_0]; // scratch for one dequantized turbo3_0 block
+    float tmp_y[QK8_0]; // scratch for one dequantized q8_0 block
+    const block_turbo3_0 * GGML_RESTRICT x = (const block_turbo3_0 *)vx;
+    const block_q8_0     * GGML_RESTRICT y = (const block_q8_0 *)vy;
+    for (int i = 0; i < nb; i++) { // block i of x is paired with block i of y
+        dequantize_row_turbo3_0(&x[i], tmp_x, QK_TURBO3_0); // NOTE(review): called with k = 32 while QK_TURBO3_0_GROUP is 128 - confirm the dequantizer is valid for a single block
+        dequantize_row_q8_0(&y[i], tmp_y, QK8_0);
+        for (int k = 0; k < QK8_0; k++) { // indexes tmp_x up to QK8_0, so this is only in bounds when QK8_0 <= QK_TURBO3_0
+            sumf += tmp_x[k] * tmp_y[k];
+        }
+    }
+    *s = sumf; // a single scalar result
+}
+
+void ggml_vec_dot_turbo2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product of n turbo2_0 values (vx) with n q8_0 values (vy), computed in float and written to *s
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); // the three strides are never read; nrc is only read by the assert below
+    assert(nrc == 1); // only the single-result form is implemented
+    const int nb = n / QK_TURBO2_0; // whole blocks only: a tail of n % QK_TURBO2_0 elements would be silently skipped
+    float sumf = 0.0f;
+    float tmp_x[QK_TURBO2_0]; // scratch for one dequantized turbo2_0 block
+    float tmp_y[QK8_0]; // scratch for one dequantized q8_0 block
+    const block_turbo2_0 * GGML_RESTRICT x = (const block_turbo2_0 *)vx;
+    const block_q8_0     * GGML_RESTRICT y = (const block_q8_0 *)vy;
+    for (int i = 0; i < nb; i++) { // block i of x is paired with block i of y
+        dequantize_row_turbo2_0(&x[i], tmp_x, QK_TURBO2_0); // NOTE(review): called with k = 32 while QK_TURBO2_0_GROUP is 128 - confirm the dequantizer is valid for a single block
+        dequantize_row_q8_0(&y[i], tmp_y, QK8_0);
+        for (int k = 0; k < QK8_0; k++) { // indexes tmp_x up to QK8_0, so this is only in bounds when QK8_0 <= QK_TURBO2_0
+            sumf += tmp_x[k] * tmp_y[k];
+        }
+    }
+    *s = sumf; // a single scalar result
+}
+
+void quantize_row_turbo4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { // CPU-backend quantizer for turbo4_0: x is the float input, y the output buffer, k is passed through (presumably the element count)
+    quantize_row_turbo4_0_ref(x, y, k); // thin wrapper: forwards all arguments unchanged to the reference implementation
+}
+
+void ggml_vec_dot_turbo4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product of n turbo4_0 values (vx) with n q8_0 values (vy), computed in float and written to *s
+    GGML_UNUSED(bs); GGML_UNUSED(bx); GGML_UNUSED(by); GGML_UNUSED(nrc); // the three strides are never read; nrc is only read by the assert below
+    assert(nrc == 1); // only the single-result form is implemented
+    const int nb = n / QK_TURBO4_0; // whole blocks only: a tail of n % QK_TURBO4_0 elements would be silently skipped
+    float sumf = 0.0f;
+    float tmp_x[QK_TURBO4_0]; // scratch for one dequantized turbo4_0 block
+    float tmp_y[QK8_0]; // scratch for one dequantized q8_0 block
+    const block_turbo4_0 * GGML_RESTRICT x_b = (const block_turbo4_0 *)vx;
+    const block_q8_0     * GGML_RESTRICT y_b = (const block_q8_0 *)vy;
+    for (int i = 0; i < nb; i++) { // block i of x_b is paired with block i of y_b
+        dequantize_row_turbo4_0(&x_b[i], tmp_x, QK_TURBO4_0); // NOTE(review): called with k = 32 while QK_TURBO4_0_GROUP is 128 - confirm the dequantizer is valid for a single block
+        dequantize_row_q8_0(&y_b[i], tmp_y, QK8_0);
+        for (int k = 0; k < QK8_0; k++) { // indexes tmp_x up to QK8_0, so this is only in bounds when QK8_0 <= QK_TURBO4_0
+            sumf += tmp_x[k] * tmp_y[k];
+        }
+    }
+    *s = sumf; // a single scalar result
+}
@@ -37,6 +37,10 @@ void quantize_row_turbo4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y
 void quantize_row_turbo5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_turbo6_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
+void quantize_row_turbo3_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_turbo2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_turbo4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
 void quantize_row_rq3_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_rq4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_rq5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -74,6 +78,10 @@ void ggml_vec_dot_rq4_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_vec_dot_rq5_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_rq6_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
+void ggml_vec_dot_turbo3_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_turbo2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_turbo4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
@@ -1157,10 +1157,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                 return false;
             }
             if (op->src[1]->type != op->src[2]->type) {
-                // Allow mixed turbo/rq types (auto-asymmetric K/V)
-                const bool k_is_turbo = (op->src[1]->type >= GGML_TYPE_TURBO3_1 && op->src[1]->type <= GGML_TYPE_RQ6_1);
-                const bool v_is_turbo = (op->src[2]->type >= GGML_TYPE_TURBO3_1 && op->src[2]->type <= GGML_TYPE_RQ6_1);
-                if (!(k_is_turbo && v_is_turbo)) {
+                // Only allow cross-type combos that have FA kernel instantiations
+                const enum ggml_type tk = op->src[1]->type;
+                const enum ggml_type tv = op->src[2]->type;
+                const bool valid_cross =
+                    (tk == GGML_TYPE_TURBO5_1 && tv == GGML_TYPE_TURBO4_1) ||
+                    (tk == GGML_TYPE_TURBO4_1 && tv == GGML_TYPE_TURBO3_1) ||
+                    (tk == GGML_TYPE_RQ5_1    && tv == GGML_TYPE_RQ4_1)    ||
+                    (tk == GGML_TYPE_RQ4_1    && tv == GGML_TYPE_RQ3_1)    ||
+                    (tk == GGML_TYPE_TURBO3_0 && tv == GGML_TYPE_TURBO2_0) ||
+                    (tk == GGML_TYPE_TURBO2_0 && tv == GGML_TYPE_TURBO3_0);
+                if (!valid_cross) {
                     return false;
                 }
             }
@@ -1259,6 +1266,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                     case GGML_TYPE_RQ4_1:
                     case GGML_TYPE_RQ5_1:
                     case GGML_TYPE_RQ6_1:
+                    case GGML_TYPE_TURBO3_0:
+                    case GGML_TYPE_TURBO2_0:
+                    case GGML_TYPE_TURBO4_0:
                         return true;
                     default:
                         return false;
 
@@ -2838,6 +2838,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 
         auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
 
+        GGML_ASSERT(pipeline.pipeline && "FA non-vec pipeline is null - missing kernel instantiation for this K/V type combo");
         ggml_metal_encoder_set_pipeline(enc, pipeline);
         ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
         ggml_metal_encoder_set_buffer  (enc, bid_src0, 1);
@@ -2976,6 +2977,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 
         auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
 
+        GGML_ASSERT(pipeline.pipeline && "FA vec pipeline is null - missing kernel instantiation for this K/V type combo");
         GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
 
         ggml_metal_encoder_set_pipeline(enc, pipeline);