Skip to content

Commit c299c8f

Browse files
unamedkr authored and claude committed
Fix multi-block attention + wire integer kernel + real-time demo
CRITICAL FIXES:
- BUG #1: Multi-block attention for head_dim > 128 (was processing only the first block).
  Fixed in: tq_uniform.c, tq_mixed.c, tq_polar.c, tq_neon.c, tq_context.c.
  All attention functions now iterate blocks_per_key blocks per key vector.
- BUG #2: Integer attention (tq_uniform_4b_attention_int_ref) is now registered in
  TQ_TRAITS — previously the slow dequantize path was always used.

VERIFIED ON REAL MODEL:
- Qwen3.5-0.8B (head_dim=256, 2 blocks per key): cosine 0.9802 (A grade).
- Was: cosine 0.0000 (completely broken for head_dim > 128).

NEW:
- tests/test_multiblock.cpp: 5 tests for dim=256 and dim=384 multi-block handling.
- tools/tq_realtime_demo.py: end-to-end demo with actual model KV-cache compression
  and TurboQuant attention (not PyTorch).

18/18 tests pass. All existing tests unaffected.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7375cdb commit c299c8f

8 files changed

Lines changed: 793 additions & 254 deletions

File tree

src/backend/cpu/tq_neon.c

Lines changed: 156 additions & 166 deletions
Large diffs are not rendered by default.

src/core/tq_context.c

Lines changed: 30 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -86,12 +86,19 @@ tq_status tq_quantize_keys(tq_context_t* ctx,
8686

8787
pthread_mutex_lock(&ctx->mutex);
8888

89-
size_t type_size = TQ_TRAITS[type].type_size;
89+
size_t block_size = TQ_TRAITS[type].block_size;
90+
size_t type_size = TQ_TRAITS[type].type_size;
91+
int blocks_per_key = (head_dim + (int)block_size - 1) / (int)block_size;
9092
uint8_t* dst = (uint8_t*)out;
9193

9294
for (int i = 0; i < n; i++) {
93-
qfn(keys + i * head_dim, dst, head_dim);
94-
dst += type_size;
95+
for (int b = 0; b < blocks_per_key; b++) {
96+
int offset = b * (int)block_size;
97+
int chunk = head_dim - offset;
98+
if (chunk > (int)block_size) chunk = (int)block_size;
99+
qfn(keys + i * head_dim + offset, dst, chunk);
100+
dst += type_size;
101+
}
95102
}
96103

97104
pthread_mutex_unlock(&ctx->mutex);
@@ -110,12 +117,19 @@ tq_status tq_dequantize_keys(tq_context_t* ctx,
110117
tq_dequantize_fn dfn = TQ_TRAITS[type].dequantize;
111118
if (!dfn) return TQ_ERR_NOT_IMPL;
112119

113-
size_t type_size = TQ_TRAITS[type].type_size;
120+
size_t block_size = TQ_TRAITS[type].block_size;
121+
size_t type_size = TQ_TRAITS[type].type_size;
122+
int blocks_per_key = (head_dim + (int)block_size - 1) / (int)block_size;
114123
const uint8_t* src = (const uint8_t*)quantized;
115124

116125
for (int i = 0; i < n; i++) {
117-
dfn(src, out + i * head_dim, head_dim);
118-
src += type_size;
126+
for (int b = 0; b < blocks_per_key; b++) {
127+
int offset = b * (int)block_size;
128+
int chunk = head_dim - offset;
129+
if (chunk > (int)block_size) chunk = (int)block_size;
130+
dfn(src, out + i * head_dim + offset, chunk);
131+
src += type_size;
132+
}
119133
}
120134

121135
return TQ_OK;
@@ -132,12 +146,19 @@ tq_status tq_quantize_values(tq_context_t* ctx,
132146
tq_quantize_fn qfn = TQ_TRAITS[type].quantize;
133147
if (!qfn) return TQ_ERR_NOT_IMPL;
134148

135-
size_t type_size = TQ_TRAITS[type].type_size;
149+
size_t block_size = TQ_TRAITS[type].block_size;
150+
size_t type_size = TQ_TRAITS[type].type_size;
151+
int blocks_per_key = ((int)head_dim + (int)block_size - 1) / (int)block_size;
136152
uint8_t* dst = (uint8_t*)out;
137153

138154
for (int i = 0; i < n; i++) {
139-
qfn(values + i * head_dim, dst, head_dim);
140-
dst += type_size;
155+
for (int b = 0; b < blocks_per_key; b++) {
156+
int offset = b * (int)block_size;
157+
int chunk = head_dim - offset;
158+
if (chunk > (int)block_size) chunk = (int)block_size;
159+
qfn(values + i * head_dim + offset, dst, chunk);
160+
dst += type_size;
161+
}
141162
}
142163

143164
return TQ_OK;

src/core/tq_mixed.c

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -149,12 +149,21 @@ void tq_mixed_4b8_dequantize_ref(const void* src, float* dst, int n) {
149149

150150
void tq_mixed_4b8_attention_ref(const float* query, const void* kv,
151151
float* scores, int seq_len, int head_dim) {
152-
const block_tq_mixed_4b8* blocks = (const block_tq_mixed_4b8*)kv;
152+
int blocks_per_key = (head_dim + TQ_BK - 1) / TQ_BK;
153+
const block_tq_mixed_4b8* all_blocks = (const block_tq_mixed_4b8*)kv;
154+
153155
for (int s = 0; s < seq_len; s++) {
154-
float deq[256]; /* max head_dim */
155-
tq_mixed_4b8_dequantize_ref(&blocks[s], deq, head_dim);
156156
float dot = 0;
157-
for (int d = 0; d < head_dim; d++) dot += query[d] * deq[d];
157+
for (int b = 0; b < blocks_per_key; b++) {
158+
int offset = b * TQ_BK;
159+
int chunk = (head_dim - offset > TQ_BK) ? TQ_BK : (head_dim - offset);
160+
161+
float deq[TQ_BK];
162+
tq_mixed_4b8_dequantize_ref(&all_blocks[s * blocks_per_key + b], deq, chunk);
163+
164+
for (int d = 0; d < chunk; d++)
165+
dot += query[offset + d] * deq[d];
166+
}
158167
scores[s] = dot;
159168
}
160169
}

src/core/tq_polar.c

Lines changed: 44 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -136,61 +136,57 @@ void tq_polar_dequantize_ref(const void* src, float* dst, int n) {
136136

137137
void tq_polar_attention_ref(const float* query, const void* kv_cache,
138138
float* scores, int seq_len, int head_dim) {
139-
/* Each key is one block_tq_polar covering head_dim elements.
140-
* Instead of dequantizing each key to FP32 then computing dot product,
141-
* we precompute cos/sin/radius lookup tables per block and gather by index.
142-
* This matches the Triton kernel in refs/PolarQuant/models/kernel4group.py. */
143-
const block_tq_polar* blocks = (const block_tq_polar*)kv_cache;
144-
int pairs = head_dim / 2;
145-
if (pairs > TQ_BK / 2) pairs = TQ_BK / 2;
139+
/* Each key may span multiple blocks when head_dim > TQ_BK.
140+
* We precompute cos/sin/radius lookup tables per block and gather by index. */
141+
int blocks_per_key = (head_dim + TQ_BK - 1) / TQ_BK;
142+
const block_tq_polar* all_blocks = (const block_tq_polar*)kv_cache;
146143

147144
/* Theta uses 2 bits (4 levels), rho uses 2 bits (4 levels) */
148145
const int theta_levels = 4;
149146
const int rho_levels = 4;
150147

151148
for (int s = 0; s < seq_len; s++) {
152-
const block_tq_polar* block = &blocks[s];
153-
154-
/* Decode block parameters from FP16 */
155-
float tscale = fp16_to_fp32(block->tscale);
156-
float tmin = fp16_to_fp32(block->tmn);
157-
float rscale = fp16_to_fp32(block->rscale);
158-
float rmin = fp16_to_fp32(block->rmn);
159-
160-
/* Step 1: Precompute theta lookup tables
161-
* For quantization level q: theta = tmin + (q + 0.5) * tscale
162-
* Using floor-based quantization with bin-centered reconstruction
163-
* matching the Triton reference kernel. */
164-
float cos_lut[4], sin_lut[4];
165-
for (int q = 0; q < theta_levels; q++) {
166-
float theta = tmin + ((float)q + 0.5f) * tscale;
167-
cos_lut[q] = cosf(theta);
168-
sin_lut[q] = sinf(theta);
169-
}
170-
171-
/* Step 2: Precompute radius lookup table */
172-
float radius_lut[4];
173-
for (int q = 0; q < rho_levels; q++) {
174-
radius_lut[q] = rmin + ((float)q + 0.5f) * rscale;
175-
}
176-
177-
/* Step 3: For each pair, gather from LUT by index and accumulate */
178149
float score = 0.0f;
179-
for (int i = 0; i < pairs; i++) {
180-
/* Extract packed indices (same layout as quantize/dequantize) */
181-
uint8_t byte = block->indices[i / 2];
182-
uint8_t packed = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
183-
int tq = packed & 0x03;
184-
int rq = (packed >> 2) & 0x03;
185-
186-
/* Dot product contribution from this pair:
187-
* key[2i] = radius * cos(theta)
188-
* key[2i+1] = radius * sin(theta)
189-
* contrib = query[2i] * radius * cos(theta) + query[2i+1] * radius * sin(theta)
190-
* = radius * (query[2i] * cos(theta) + query[2i+1] * sin(theta)) */
191-
float contrib = query[2 * i] * cos_lut[tq] + query[2 * i + 1] * sin_lut[tq];
192-
contrib *= radius_lut[rq];
193-
score += contrib;
150+
151+
for (int blk = 0; blk < blocks_per_key; blk++) {
152+
int offset = blk * TQ_BK;
153+
int chunk = (head_dim - offset > TQ_BK) ? TQ_BK : (head_dim - offset);
154+
int pairs = chunk / 2;
155+
156+
const block_tq_polar* block = &all_blocks[s * blocks_per_key + blk];
157+
158+
/* Decode block parameters from FP16 */
159+
float tscale = fp16_to_fp32(block->tscale);
160+
float tmin = fp16_to_fp32(block->tmn);
161+
float rscale = fp16_to_fp32(block->rscale);
162+
float rmin = fp16_to_fp32(block->rmn);
163+
164+
/* Precompute theta lookup tables */
165+
float cos_lut[4], sin_lut[4];
166+
for (int q = 0; q < theta_levels; q++) {
167+
float theta = tmin + ((float)q + 0.5f) * tscale;
168+
cos_lut[q] = cosf(theta);
169+
sin_lut[q] = sinf(theta);
170+
}
171+
172+
/* Precompute radius lookup table */
173+
float radius_lut[4];
174+
for (int q = 0; q < rho_levels; q++) {
175+
radius_lut[q] = rmin + ((float)q + 0.5f) * rscale;
176+
}
177+
178+
/* For each pair, gather from LUT by index and accumulate */
179+
for (int i = 0; i < pairs; i++) {
180+
uint8_t byte = block->indices[i / 2];
181+
uint8_t packed = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
182+
int tq = packed & 0x03;
183+
int rq = (packed >> 2) & 0x03;
184+
185+
float contrib = query[offset + 2 * i] * cos_lut[tq]
186+
+ query[offset + 2 * i + 1] * sin_lut[tq];
187+
contrib *= radius_lut[rq];
188+
score += contrib;
189+
}
194190
}
195191

196192
scores[s] = score;

src/core/tq_traits.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@ extern void tq_uniform_4b_quantize_ref(const float* src, void* dst, int n);
2121
extern void tq_uniform_4b_dequantize_ref(const void* src, float* dst, int n);
2222
extern void tq_uniform_4b_attention_ref(const float* query, const void* kv,
2323
float* scores, int seq_len, int head_dim);
24+
extern void tq_uniform_4b_attention_int_ref(const float* query, const void* kv,
25+
float* scores, int seq_len, int head_dim);
2426
extern void tq_uniform_2b_quantize_ref(const float* src, void* dst, int n);
2527
extern void tq_uniform_2b_dequantize_ref(const void* src, float* dst, int n);
2628
extern void tq_uniform_2b_attention_ref(const float* query, const void* kv,
@@ -89,7 +91,7 @@ const tq_type_traits_t TQ_TRAITS[TQ_TYPE_COUNT] = {
8991
.bpe = (float)sizeof(block_tq_uniform_4b) * 8.0f / TQ_BK,
9092
.quantize = tq_uniform_4b_quantize_ref,
9193
.dequantize = tq_uniform_4b_dequantize_ref,
92-
.attention = tq_uniform_4b_attention_ref,
94+
.attention = tq_uniform_4b_attention_int_ref,
9395
.residual_type = TQ_TYPE_COUNT,
9496
},
9597
[TQ_TYPE_UNIFORM_2B] = {

src/core/tq_uniform.c

Lines changed: 51 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -182,40 +182,56 @@ void tq_uniform_4b_attention_int_ref(const float* query, const void* kv,
182182
float q_scale, q_sum;
183183
tq_quantize_query_q8(query, q8, &q_scale, &q_sum, head_dim);
184184

185-
const block_tq_uniform_4b* blocks = (const block_tq_uniform_4b*)kv;
185+
int blocks_per_key = (head_dim + TQ_BK - 1) / TQ_BK;
186+
const block_tq_uniform_4b* all_blocks = (const block_tq_uniform_4b*)kv;
186187

187188
for (int s = 0; s < seq_len; s++) {
188-
float k_scale = uni_fp16_to_fp32(blocks[s].scale);
189-
float k_zp = uni_fp16_to_fp32(blocks[s].zero_point);
190-
float k_offset = k_zp + 0.5f * k_scale; /* bin centering */
191-
192-
/* Step 2: Integer dot product (no dequantize!) */
193-
int32_t isum = 0;
194-
for (int i = 0; i < head_dim / 2; i++) {
195-
uint8_t packed = blocks[s].qs[i];
196-
int32_t q4_lo = (int32_t)(packed & 0x0F); /* low nibble [0,15] */
197-
int32_t q4_hi = (int32_t)(packed >> 4); /* high nibble [0,15] */
198-
199-
isum += q4_lo * (int32_t)q8[2*i];
200-
isum += q4_hi * (int32_t)q8[2*i + 1];
189+
float score = 0;
190+
for (int b = 0; b < blocks_per_key; b++) {
191+
int offset = b * TQ_BK;
192+
int chunk = (head_dim - offset > TQ_BK) ? TQ_BK : (head_dim - offset);
193+
const block_tq_uniform_4b* block = &all_blocks[s * blocks_per_key + b];
194+
195+
float k_scale = uni_fp16_to_fp32(block->scale);
196+
float k_zp = uni_fp16_to_fp32(block->zero_point);
197+
198+
/* Integer dot product (no dequantize!) */
199+
int32_t isum = 0;
200+
for (int i = 0; i < chunk / 2; i++) {
201+
uint8_t packed = block->qs[i];
202+
isum += (int32_t)(packed & 0x0F) * (int32_t)q8[offset + 2*i];
203+
isum += (int32_t)(packed >> 4) * (int32_t)q8[offset + 2*i + 1];
204+
}
205+
206+
/* Partial query sum for this block's zero-point correction */
207+
float block_q_sum = 0;
208+
for (int d = 0; d < chunk; d++) block_q_sum += query[offset + d];
209+
210+
score += (float)isum * k_scale * q_scale + (k_zp + 0.5f * k_scale) * block_q_sum;
201211
}
202-
203-
/* Step 3: Convert to float ONCE with combined scale
204-
* dot ~ k_scale * q_scale * isum + k_offset * q_sum */
205-
scores[s] = (float)isum * k_scale * q_scale + k_offset * q_sum;
212+
scores[s] = score;
206213
}
207214
}
208215

209216
/* ---------- Uniform 4-bit attention (dequantize + dot product) ---------- */
210217

211218
void tq_uniform_4b_attention_ref(const float* query, const void* kv,
212219
float* scores, int seq_len, int head_dim) {
213-
const block_tq_uniform_4b* blocks = (const block_tq_uniform_4b*)kv;
220+
int blocks_per_key = (head_dim + TQ_BK - 1) / TQ_BK;
221+
const block_tq_uniform_4b* all_blocks = (const block_tq_uniform_4b*)kv;
222+
214223
for (int s = 0; s < seq_len; s++) {
215-
float deq[256]; /* max head_dim */
216-
tq_uniform_4b_dequantize_ref(&blocks[s], deq, head_dim);
217224
float dot = 0;
218-
for (int d = 0; d < head_dim; d++) dot += query[d] * deq[d];
225+
for (int b = 0; b < blocks_per_key; b++) {
226+
int offset = b * TQ_BK;
227+
int chunk = (head_dim - offset > TQ_BK) ? TQ_BK : (head_dim - offset);
228+
229+
float deq[TQ_BK];
230+
tq_uniform_4b_dequantize_ref(&all_blocks[s * blocks_per_key + b], deq, chunk);
231+
232+
for (int d = 0; d < chunk; d++)
233+
dot += query[offset + d] * deq[d];
234+
}
219235
scores[s] = dot;
220236
}
221237
}
@@ -224,12 +240,21 @@ void tq_uniform_4b_attention_ref(const float* query, const void* kv,
224240

225241
void tq_uniform_2b_attention_ref(const float* query, const void* kv,
226242
float* scores, int seq_len, int head_dim) {
227-
const block_tq_uniform_2b* blocks = (const block_tq_uniform_2b*)kv;
243+
int blocks_per_key = (head_dim + TQ_BK - 1) / TQ_BK;
244+
const block_tq_uniform_2b* all_blocks = (const block_tq_uniform_2b*)kv;
245+
228246
for (int s = 0; s < seq_len; s++) {
229-
float deq[256]; /* max head_dim */
230-
tq_uniform_2b_dequantize_ref(&blocks[s], deq, head_dim);
231247
float dot = 0;
232-
for (int d = 0; d < head_dim; d++) dot += query[d] * deq[d];
248+
for (int b = 0; b < blocks_per_key; b++) {
249+
int offset = b * TQ_BK;
250+
int chunk = (head_dim - offset > TQ_BK) ? TQ_BK : (head_dim - offset);
251+
252+
float deq[TQ_BK];
253+
tq_uniform_2b_dequantize_ref(&all_blocks[s * blocks_per_key + b], deq, chunk);
254+
255+
for (int d = 0; d < chunk; d++)
256+
dot += query[offset + d] * deq[d];
257+
}
233258
scores[s] = dot;
234259
}
235260
}

0 commit comments

Comments (0)