Skip to content

Commit c14e7ea

Browse files
unamedkr authored and claude committed
1-bit KV (14x compression) + Q2 weights: extreme quantization
1-bit KV cache (TQ_TYPE_TURBO_KV_1B): - Pure sign hash after RHT: norm (2B) + signs (d/8 bytes) = 24 bytes for d=128 - Attention via XOR + popcount (NEON vcntq_u8): 14.2x compression - Gemma 4B: "France" → "Paris" ✓ at 1-bit, 11.5 tok/s (2.3x vs uniform) Q2 weight quantization: - Lloyd-Max 4-centroid Gaussian codebook - Q2xQ8 integer dot product with NEON vdotq_s32 - Works on 4B+ models (270M too small for Q2 quality) Display fix: show "weights=Q2" in output stats. 23/23 tests pass, zero warnings. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3d8c13f commit c14e7ea

11 files changed

Lines changed: 1215 additions & 33 deletions

File tree

include/turboquant/tq_engine.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,25 @@ typedef struct {
106106
uint8_t* delta_in_proj_b_q4; float* delta_in_proj_b_q4s;
107107
uint8_t* delta_out_proj_q4; float* delta_out_proj_q4s;
108108

109+
/* Q2_0 quantized weights: packed 2-bit data + per-block float scale (block_size=32)
110+
* Each block of 32 values stored as 8 packed bytes + 1 float scale.
111+
* Uses Lloyd-Max codebook: centroid indices {0,1,2,3} -> {-1.510, -0.453, 0.453, 1.510}
112+
* When use_q2 is set, these replace FP32 pointers (set to NULL). */
113+
uint8_t* wq_q2; float* wq_q2s; /* Q2 q_proj */
114+
uint8_t* wk_q2; float* wk_q2s; /* Q2 k_proj */
115+
uint8_t* wv_q2; float* wv_q2s; /* Q2 v_proj */
116+
uint8_t* wo_q2; float* wo_q2s; /* Q2 o_proj */
117+
uint8_t* w_gate_q2; float* w_gate_q2s;/* Q2 gate_proj */
118+
uint8_t* w_up_q2; float* w_up_q2s; /* Q2 up_proj */
119+
uint8_t* w_down_q2; float* w_down_q2s;/* Q2 down_proj */
120+
121+
/* DeltaNet Q2 weights */
122+
uint8_t* delta_in_proj_qkv_q2; float* delta_in_proj_qkv_q2s;
123+
uint8_t* delta_in_proj_z_q2; float* delta_in_proj_z_q2s;
124+
uint8_t* delta_in_proj_a_q2; float* delta_in_proj_a_q2s;
125+
uint8_t* delta_in_proj_b_q2; float* delta_in_proj_b_q2s;
126+
uint8_t* delta_out_proj_q2; float* delta_out_proj_q2s;
127+
109128
/* DeltaNet (linear_attention) weights (NULL for self_attn layers) */
110129
float* delta_a_log; /* [delta_n_heads] decay parameter (log scale) */
111130
float* delta_conv1d; /* [qkv_dim, 1, conv_width] */
@@ -157,6 +176,11 @@ typedef struct {
157176
void* _q4_data; /* heap buffer for all Q4 quantized weights */
158177
size_t _q4_size;
159178

179+
/* Q2 weight quantization */
180+
int use_q2_weights; /* 1 if layer weights are Q2-quantized */
181+
void* _q2_data; /* heap buffer for all Q2 quantized weights */
182+
size_t _q2_size;
183+
160184
/* Memory management — supports multi-shard safetensors */
161185
#define TQ_MAX_SHARDS 16
162186
void* _mmap_data; /* primary mmap (shard 0 or TQM file) */
@@ -368,6 +392,12 @@ void tq_matmul_q4_preq(float* out, const uint8_t* w_qs, const float* w_scales,
368392
const int8_t* x_q8, const float* x_scales, int n, int d);
369393
void tq_quantize_row_q4(const float* src, uint8_t* dst_qs, float* dst_scales, int n);
370394
void tq_quantize_weights_q4(tq_model_t* model);
395+
void tq_matmul_q2(float* out, const float* x, const uint8_t* w_qs, const float* w_scales,
396+
int n, int d);
397+
void tq_matmul_q2_preq(float* out, const uint8_t* w_qs, const float* w_scales,
398+
const int8_t* x_q8, const float* x_scales, int n, int d);
399+
void tq_quantize_row_q2(const float* src, uint8_t* dst_qs, float* dst_scales, int n);
400+
void tq_quantize_weights_q2(tq_model_t* model);
371401
void tq_rmsnorm(float* out, const float* x, const float* weight, int n, float eps);
372402
void tq_rope(float* q, float* k, int pos, int head_dim,
373403
int n_heads, int n_kv_heads, float freq_base);

include/turboquant/tq_types.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ typedef enum {
5151
TQ_TYPE_MIXED_4B8 = 7, /* Mixed: 4-bit base + fp16 outliers */
5252
TQ_TYPE_TURBO_KV_3B = 8, /* TurboQuant KV: 2-bit codebook + 1-bit QJL residual */
5353
TQ_TYPE_TURBO_KV_4B = 9, /* TurboQuant KV: 3-bit codebook + 1-bit QJL residual */
54-
TQ_TYPE_COUNT = 10
54+
TQ_TYPE_TURBO_KV_1B = 10,/* TurboQuant KV: 1-bit Hamming (sign only) */
55+
TQ_TYPE_COUNT = 11
5556
} tq_type;
5657

5758
/* ============================================================
@@ -202,6 +203,19 @@ typedef struct {
202203
uint8_t qjl_signs[TQ_BK / 8]; /* 1-bit QJL sign hash on residual (16B) */
203204
} block_tq_turbo_kv_4b;
204205

206+
/* TurboQuant KV cache block: 1-bit Hamming attention
 * Pure sign-bit quantization for extreme compression.
 * Pipeline: normalize -> RHT -> sign extraction (1 bit per dim).
 * Attention uses XOR + popcount for Hamming distance.
 * For dim=128: 2 + 2 + 4 + 16 = 24 bytes per key (vs 256 bytes FP16 = 10.7x compression).
 * Layout is pinned by TQ_CHECK_SIZE below (8 + TQ_BK/8 bytes); do not reorder fields.
 */
typedef struct {
    uint16_t norm;            /* L2 norm of original vector (fp16) — only magnitude info kept */
    uint16_t _pad;            /* alignment padding, always written as 0 by the quantizer */
    uint32_t rht_seed;        /* RHT random seed for this block (currently TKV_DEFAULT_SEED) */
    uint8_t signs[TQ_BK / 8]; /* 1 bit per dim, LSB-first within each byte = 16 bytes for 128 */
} block_tq_turbo_kv_1b;
218+
205219
/* ============================================================
206220
* Block size verification (compile-time, C/C++ compatible)
207221
* Uses negative-size array trick for universal compatibility.
@@ -216,5 +230,6 @@ TQ_CHECK_SIZE(block_tq_uniform_2b, 4 + TQ_BK / 4);
216230
TQ_CHECK_SIZE(block_tq_mixed_4b8, 4 + TQ_MIXED_OUTLIERS + TQ_MIXED_OUTLIERS * 2 + TQ_BK / 2);
217231
TQ_CHECK_SIZE(block_tq_turbo_kv_3b, 8 + TQ_BK / 4 + TQ_BK / 8);
218232
TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
233+
TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
219234

220235
#endif /* TQ_TYPES_H */

integrations/llamacpp/tq_kv_cache.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ enum {
4242
GGML_TYPE_TQ_MIXED_4B8 = GGML_TYPE_TQ_BASE + 7,
4343
GGML_TYPE_TQ_TURBO_KV_3B = GGML_TYPE_TQ_BASE + 8,
4444
GGML_TYPE_TQ_TURBO_KV_4B = GGML_TYPE_TQ_BASE + 9,
45-
GGML_TYPE_TQ_COUNT = 10,
45+
GGML_TYPE_TQ_TURBO_KV_1B = GGML_TYPE_TQ_BASE + 10,
46+
GGML_TYPE_TQ_COUNT = 11,
4647
};
4748

4849
/* ============================================================
@@ -61,6 +62,7 @@ static int tq_to_ggml_type(tq_type type) {
6162
case TQ_TYPE_MIXED_4B8: return GGML_TYPE_TQ_MIXED_4B8;
6263
case TQ_TYPE_TURBO_KV_3B: return GGML_TYPE_TQ_TURBO_KV_3B;
6364
case TQ_TYPE_TURBO_KV_4B: return GGML_TYPE_TQ_TURBO_KV_4B;
65+
case TQ_TYPE_TURBO_KV_1B: return GGML_TYPE_TQ_TURBO_KV_1B;
6466
default: return -1;
6567
}
6668
}
@@ -77,6 +79,7 @@ static tq_type ggml_to_tq_type(int ggml_id) {
7779
case GGML_TYPE_TQ_MIXED_4B8: return TQ_TYPE_MIXED_4B8;
7880
case GGML_TYPE_TQ_TURBO_KV_3B: return TQ_TYPE_TURBO_KV_3B;
7981
case GGML_TYPE_TQ_TURBO_KV_4B: return TQ_TYPE_TURBO_KV_4B;
82+
case GGML_TYPE_TQ_TURBO_KV_1B: return TQ_TYPE_TURBO_KV_1B;
8083
default: return TQ_TYPE_COUNT;
8184
}
8285
}
@@ -139,6 +142,7 @@ TQ_GGML_WRAPPERS(uniform_2b, TQ_TYPE_UNIFORM_2B)
139142
TQ_GGML_WRAPPERS(mixed_4b8, TQ_TYPE_MIXED_4B8)
140143
TQ_GGML_WRAPPERS(turbo_kv_3b, TQ_TYPE_TURBO_KV_3B)
141144
TQ_GGML_WRAPPERS(turbo_kv_4b, TQ_TYPE_TURBO_KV_4B)
145+
TQ_GGML_WRAPPERS(turbo_kv_1b, TQ_TYPE_TURBO_KV_1B)
142146

143147
/* ============================================================
144148
* vec_dot wrappers (quantized key . FP32 query -> scalar)
@@ -189,6 +193,7 @@ TQ_GGML_VEC_DOT(uniform_2b, TQ_TYPE_UNIFORM_2B)
189193
TQ_GGML_VEC_DOT(mixed_4b8, TQ_TYPE_MIXED_4B8)
190194
TQ_GGML_VEC_DOT(turbo_kv_3b, TQ_TYPE_TURBO_KV_3B)
191195
TQ_GGML_VEC_DOT(turbo_kv_4b, TQ_TYPE_TURBO_KV_4B)
196+
TQ_GGML_VEC_DOT(turbo_kv_1b, TQ_TYPE_TURBO_KV_1B)
192197

193198
/* ============================================================
194199
* GGML type trait table
@@ -288,6 +293,14 @@ static const tq_ggml_type_trait TQ_GGML_TRAITS[GGML_TYPE_TQ_COUNT] = {
288293
tq_ggml_to_float_turbo_kv_4b,
289294
tq_ggml_vec_dot_turbo_kv_4b,
290295
},
296+
{
297+
"tq_turbo_kv_1b", GGML_TYPE_TQ_TURBO_KV_1B, TQ_TYPE_TURBO_KV_1B,
298+
sizeof(block_tq_turbo_kv_1b), TQ_BK,
299+
(float)sizeof(block_tq_turbo_kv_1b) * 8.0f / TQ_BK,
300+
tq_ggml_from_float_turbo_kv_1b,
301+
tq_ggml_to_float_turbo_kv_1b,
302+
tq_ggml_vec_dot_turbo_kv_1b,
303+
},
291304
};
292305

293306
#define TQ_GGML_NUM_TYPES (sizeof(TQ_GGML_TRAITS) / sizeof(TQ_GGML_TRAITS[0]))
@@ -381,6 +394,9 @@ tq_type tq_parse_kv_cache_type(const char* arg) {
381394
{ "turbo_kv_4b", TQ_TYPE_TURBO_KV_4B },
382395
{ "tq-turbo-kv-4b", TQ_TYPE_TURBO_KV_4B },
383396
{ "turbokv4", TQ_TYPE_TURBO_KV_4B },
397+
{ "turbo_kv_1b", TQ_TYPE_TURBO_KV_1B },
398+
{ "tq-turbo-kv-1b", TQ_TYPE_TURBO_KV_1B },
399+
{ "turbokv1", TQ_TYPE_TURBO_KV_1B },
384400
};
385401

386402
for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++) {

src/core/tq_traits.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ extern void tq_turbo_kv_4b_dequantize_ref(const void* src, float* dst, int n);
4343
extern void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv,
4444
float* scores, int seq_len, int head_dim);
4545

46+
extern void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n);
47+
extern void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n);
48+
extern void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv,
49+
float* scores, int seq_len, int head_dim);
50+
4651
const tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
4752
[TQ_TYPE_POLAR_3B] = {
4853
.name = "polar_3b",
@@ -144,6 +149,16 @@ const tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
144149
.attention = tq_turbo_kv_4b_attention_ref,
145150
.residual_type = TQ_TYPE_QJL_1B,
146151
},
152+
[TQ_TYPE_TURBO_KV_1B] = {
153+
.name = "turbo_kv_1b",
154+
.block_size = TQ_BK,
155+
.type_size = sizeof(block_tq_turbo_kv_1b),
156+
.bpe = (float)sizeof(block_tq_turbo_kv_1b) * 8.0f / TQ_BK,
157+
.quantize = tq_turbo_kv_1b_quantize_ref,
158+
.dequantize = tq_turbo_kv_1b_dequantize_ref,
159+
.attention = tq_turbo_kv_1b_attention_ref,
160+
.residual_type = TQ_TYPE_COUNT, /* none */
161+
},
147162
};
148163

149164
const char* tq_type_name(tq_type type) {
@@ -214,6 +229,8 @@ tq_format_spec_t tq_get_format_spec(tq_type type) {
214229
case TQ_TYPE_TURBO_KV_4B:
215230
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 4;
216231
spec.flags = TQ_FLAG_HAS_RESIDUAL; break;
232+
case TQ_TYPE_TURBO_KV_1B:
233+
spec.algorithm = TQ_ALG_TURBO; spec.key_bits = 1; break;
217234
default: break;
218235
}
219236
return spec;

src/core/tq_turbo_kv.c

Lines changed: 171 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,7 @@ void tq_turbo_kv_4b_dequantize_ref(const void* src, float* dst, int n) {
493493

494494
void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv_cache,
495495
float* scores, int seq_len, int head_dim) {
496-
const block_tq_turbo_kv_4b* blocks = (const block_tq_turbo_kv_4b*)kv_cache;
496+
const block_tq_turbo_kv_4b* blocks_4b = (const block_tq_turbo_kv_4b*)kv_cache;
497497
int dim = head_dim;
498498
if (dim > TQ_BK) dim = TQ_BK;
499499

@@ -517,7 +517,7 @@ void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv_cache,
517517
}
518518

519519
for (int seq = 0; seq < seq_len; seq++) {
520-
const block_tq_turbo_kv_4b* block = &blocks[seq];
520+
const block_tq_turbo_kv_4b* block = &blocks_4b[seq];
521521
float norm = tkv_fp16_to_fp32(block->norm);
522522
float r_norm = tkv_fp16_to_fp32(block->residual_norm);
523523

@@ -604,3 +604,172 @@ void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv_cache,
604604
scores[seq] = norm * mse_dot + norm * qjl_correction;
605605
}
606606
}
607+
608+
/* ============================================================
609+
* TurboQuant KV 1-bit: quantize
610+
*
611+
* Extreme compression: normalize -> RHT -> sign extraction.
612+
* Each dimension is stored as a single sign bit.
613+
* For dim=128: 24 bytes total (8 header + 16 sign bytes).
614+
* Compression ratio: 128*4 / 24 = 21.3x vs FP32.
615+
* ============================================================ */
616+
617+
void tq_turbo_kv_1b_quantize_ref(const float* src, void* dst, int n) {
618+
block_tq_turbo_kv_1b* block = (block_tq_turbo_kv_1b*)dst;
619+
int dim = n;
620+
if (dim > TQ_BK) dim = TQ_BK;
621+
622+
/* Step 1: Compute L2 norm */
623+
float norm_sq = 0.0f;
624+
for (int i = 0; i < dim; i++) {
625+
norm_sq += src[i] * src[i];
626+
}
627+
float norm = sqrtf(norm_sq);
628+
block->norm = tkv_fp32_to_fp16(norm);
629+
block->_pad = 0;
630+
631+
/* Step 2: Normalize and copy to working buffer */
632+
float rotated[TQ_BK];
633+
float inv_norm = (norm > 1e-10f) ? (1.0f / norm) : 0.0f;
634+
for (int i = 0; i < dim; i++) {
635+
rotated[i] = src[i] * inv_norm;
636+
}
637+
for (int i = dim; i < TQ_BK; i++) {
638+
rotated[i] = 0.0f;
639+
}
640+
641+
/* Step 3: Apply RHT (in-place on rotated) */
642+
uint32_t seed = TKV_DEFAULT_SEED;
643+
block->rht_seed = seed;
644+
tq_rht_transform(rotated, dim, seed);
645+
646+
/* Step 4: Extract sign bits -- 1 bit per dimension */
647+
int sign_bytes = dim / 8;
648+
memset(block->signs, 0, (size_t)sign_bytes);
649+
for (int i = 0; i < dim; i++) {
650+
if (rotated[i] >= 0.0f) {
651+
block->signs[i / 8] |= (uint8_t)(1 << (i % 8));
652+
}
653+
}
654+
}
655+
656+
/* ============================================================
657+
* TurboQuant KV 1-bit: dequantize (rough reconstruction)
658+
*
659+
* Reconstruct: sign * (norm / sqrt(dim)) then inverse RHT.
660+
* This is a very rough reconstruction -- the real value of 1-bit
661+
* is in Hamming attention, not point-wise dequant.
662+
* ============================================================ */
663+
664+
void tq_turbo_kv_1b_dequantize_ref(const void* src, float* dst, int n) {
665+
const block_tq_turbo_kv_1b* block = (const block_tq_turbo_kv_1b*)src;
666+
int dim = n;
667+
if (dim > TQ_BK) dim = TQ_BK;
668+
669+
float norm = tkv_fp16_to_fp32(block->norm);
670+
uint32_t seed = block->rht_seed;
671+
672+
/* Reconstruct sign vector in rotated space.
673+
* After RHT, coordinates are ~N(0, 1/sqrt(dim)).
674+
* Expected |x| for half-normal = sqrt(2/pi) * sigma = sqrt(2/pi) / sqrt(dim).
675+
* So sign * sqrt(2/pi) / sqrt(dim) is the expected reconstruction. */
676+
float scale = sqrtf(2.0f / TQ_PI) / sqrtf((float)dim);
677+
float rotated[TQ_BK];
678+
for (int i = 0; i < dim; i++) {
679+
int bit = (block->signs[i / 8] >> (i % 8)) & 1;
680+
rotated[i] = bit ? scale : -scale;
681+
}
682+
683+
/* Inverse RHT */
684+
tq_rht_inverse(rotated, dim, seed);
685+
686+
/* Scale by original norm */
687+
for (int i = 0; i < dim; i++) {
688+
dst[i] = rotated[i] * norm;
689+
}
690+
}
691+
692+
/* ============================================================
693+
* TurboQuant KV 1-bit: attention (XOR + popcount Hamming)
694+
*
695+
* Ultra-fast attention using bitwise operations:
696+
* 1. RHT(query) computed ONCE
697+
* 2. Extract query sign bits ONCE
698+
* 3. Per key: XOR + popcount -> Hamming distance -> score
699+
*
700+
* The inner product estimator:
701+
* <q, k> ~ q_norm * k_norm * sqrt(pi/2) / dim * (2*agree - dim)
702+
* where agree = dim - hamming_distance(q_signs, k_signs).
703+
*
704+
* NEON vectorization for popcount with scalar fallback.
705+
* ============================================================ */
706+
707+
void tq_turbo_kv_1b_attention_ref(const float* query, const void* kv_cache,
708+
float* scores, int seq_len, int head_dim) {
709+
const block_tq_turbo_kv_1b* blocks = (const block_tq_turbo_kv_1b*)kv_cache;
710+
int dim = head_dim;
711+
if (dim > TQ_BK) dim = TQ_BK;
712+
713+
float scale_factor = sqrtf(TQ_PI_2) / (float)dim;
714+
715+
/* Step 1: RHT(query) computed ONCE */
716+
float q_rot[TQ_BK];
717+
memcpy(q_rot, query, (size_t)dim * sizeof(float));
718+
for (int i = dim; i < TQ_BK; i++) q_rot[i] = 0.0f;
719+
tq_rht_transform(q_rot, dim, TKV_DEFAULT_SEED);
720+
721+
/* Step 2: Compute query L2 norm */
722+
float q_norm_sq = 0.0f;
723+
for (int i = 0; i < dim; i++) {
724+
q_norm_sq += query[i] * query[i];
725+
}
726+
float q_norm = sqrtf(q_norm_sq);
727+
728+
/* Step 3: Extract query sign bits */
729+
int sign_bytes = dim / 8;
730+
uint8_t q_signs[TQ_BK / 8];
731+
memset(q_signs, 0, (size_t)sign_bytes);
732+
for (int i = 0; i < dim; i++) {
733+
if (q_rot[i] >= 0.0f) {
734+
q_signs[i / 8] |= (uint8_t)(1 << (i % 8));
735+
}
736+
}
737+
738+
/* Step 4: Per-key Hamming attention */
739+
for (int seq = 0; seq < seq_len; seq++) {
740+
const block_tq_turbo_kv_1b* blk = &blocks[seq];
741+
float k_norm = tkv_fp16_to_fp32(blk->norm);
742+
743+
/* XOR + popcount to get Hamming distance */
744+
int hamming = 0;
745+
#ifdef __ARM_NEON
746+
if (sign_bytes == 16) {
747+
/* Optimized path for dim=128 (16 sign bytes) */
748+
uint8x16_t vq = vld1q_u8(q_signs);
749+
uint8x16_t vk = vld1q_u8(blk->signs);
750+
uint8x16_t vxor = veorq_u8(vq, vk);
751+
/* Count bits: use NEON vcntq_u8 for byte-level popcount */
752+
uint8x16_t vcnt = vcntq_u8(vxor);
753+
/* Horizontal sum of all byte popcounts */
754+
hamming = vaddlvq_u8(vcnt);
755+
} else {
756+
for (int b = 0; b < sign_bytes; b++) {
757+
uint8_t xor_byte = q_signs[b] ^ blk->signs[b];
758+
hamming += __builtin_popcount(xor_byte);
759+
}
760+
}
761+
#else
762+
for (int b = 0; b < sign_bytes; b++) {
763+
uint8_t xor_byte = q_signs[b] ^ blk->signs[b];
764+
/* Portable popcount using Kernighan's bit trick */
765+
int c = 0;
766+
while (xor_byte) { c++; xor_byte &= xor_byte - 1; }
767+
hamming += c;
768+
}
769+
#endif
770+
771+
int agree = dim - hamming;
772+
float score = q_norm * k_norm * scale_factor * (float)(2 * agree - dim);
773+
scores[seq] = score;
774+
}
775+
}

0 commit comments

Comments
 (0)