@@ -493,7 +493,7 @@ void tq_turbo_kv_4b_dequantize_ref(const void* src, float* dst, int n) {
493493
494494void tq_turbo_kv_4b_attention_ref (const float * query , const void * kv_cache ,
495495 float * scores , int seq_len , int head_dim ) {
496- const block_tq_turbo_kv_4b * blocks = (const block_tq_turbo_kv_4b * )kv_cache ;
496+ const block_tq_turbo_kv_4b * blocks_4b = (const block_tq_turbo_kv_4b * )kv_cache ;
497497 int dim = head_dim ;
498498 if (dim > TQ_BK ) dim = TQ_BK ;
499499
@@ -517,7 +517,7 @@ void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv_cache,
517517 }
518518
519519 for (int seq = 0 ; seq < seq_len ; seq ++ ) {
520- const block_tq_turbo_kv_4b * block = & blocks [seq ];
520+ const block_tq_turbo_kv_4b * block = & blocks_4b [seq ];
521521 float norm = tkv_fp16_to_fp32 (block -> norm );
522522 float r_norm = tkv_fp16_to_fp32 (block -> residual_norm );
523523
@@ -604,3 +604,172 @@ void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv_cache,
604604 scores [seq ] = norm * mse_dot + norm * qjl_correction ;
605605 }
606606}
607+
608+ /* ============================================================
609+ * TurboQuant KV 1-bit: quantize
610+ *
611+ * Extreme compression: normalize -> RHT -> sign extraction.
612+ * Each dimension is stored as a single sign bit.
613+ * For dim=128: 24 bytes total (8 header + 16 sign bytes).
614+ * Compression ratio: 128*4 / 24 = 21.3x vs FP32.
615+ * ============================================================ */
616+
617+ void tq_turbo_kv_1b_quantize_ref (const float * src , void * dst , int n ) {
618+ block_tq_turbo_kv_1b * block = (block_tq_turbo_kv_1b * )dst ;
619+ int dim = n ;
620+ if (dim > TQ_BK ) dim = TQ_BK ;
621+
622+ /* Step 1: Compute L2 norm */
623+ float norm_sq = 0.0f ;
624+ for (int i = 0 ; i < dim ; i ++ ) {
625+ norm_sq += src [i ] * src [i ];
626+ }
627+ float norm = sqrtf (norm_sq );
628+ block -> norm = tkv_fp32_to_fp16 (norm );
629+ block -> _pad = 0 ;
630+
631+ /* Step 2: Normalize and copy to working buffer */
632+ float rotated [TQ_BK ];
633+ float inv_norm = (norm > 1e-10f ) ? (1.0f / norm ) : 0.0f ;
634+ for (int i = 0 ; i < dim ; i ++ ) {
635+ rotated [i ] = src [i ] * inv_norm ;
636+ }
637+ for (int i = dim ; i < TQ_BK ; i ++ ) {
638+ rotated [i ] = 0.0f ;
639+ }
640+
641+ /* Step 3: Apply RHT (in-place on rotated) */
642+ uint32_t seed = TKV_DEFAULT_SEED ;
643+ block -> rht_seed = seed ;
644+ tq_rht_transform (rotated , dim , seed );
645+
646+ /* Step 4: Extract sign bits -- 1 bit per dimension */
647+ int sign_bytes = dim / 8 ;
648+ memset (block -> signs , 0 , (size_t )sign_bytes );
649+ for (int i = 0 ; i < dim ; i ++ ) {
650+ if (rotated [i ] >= 0.0f ) {
651+ block -> signs [i / 8 ] |= (uint8_t )(1 << (i % 8 ));
652+ }
653+ }
654+ }
655+
656+ /* ============================================================
657+ * TurboQuant KV 1-bit: dequantize (rough reconstruction)
658+ *
659+ * Reconstruct: sign * (norm / sqrt(dim)) then inverse RHT.
660+ * This is a very rough reconstruction -- the real value of 1-bit
661+ * is in Hamming attention, not point-wise dequant.
662+ * ============================================================ */
663+
664+ void tq_turbo_kv_1b_dequantize_ref (const void * src , float * dst , int n ) {
665+ const block_tq_turbo_kv_1b * block = (const block_tq_turbo_kv_1b * )src ;
666+ int dim = n ;
667+ if (dim > TQ_BK ) dim = TQ_BK ;
668+
669+ float norm = tkv_fp16_to_fp32 (block -> norm );
670+ uint32_t seed = block -> rht_seed ;
671+
672+ /* Reconstruct sign vector in rotated space.
673+ * After RHT, coordinates are ~N(0, 1/sqrt(dim)).
674+ * Expected |x| for half-normal = sqrt(2/pi) * sigma = sqrt(2/pi) / sqrt(dim).
675+ * So sign * sqrt(2/pi) / sqrt(dim) is the expected reconstruction. */
676+ float scale = sqrtf (2.0f / TQ_PI ) / sqrtf ((float )dim );
677+ float rotated [TQ_BK ];
678+ for (int i = 0 ; i < dim ; i ++ ) {
679+ int bit = (block -> signs [i / 8 ] >> (i % 8 )) & 1 ;
680+ rotated [i ] = bit ? scale : - scale ;
681+ }
682+
683+ /* Inverse RHT */
684+ tq_rht_inverse (rotated , dim , seed );
685+
686+ /* Scale by original norm */
687+ for (int i = 0 ; i < dim ; i ++ ) {
688+ dst [i ] = rotated [i ] * norm ;
689+ }
690+ }
691+
692+ /* ============================================================
693+ * TurboQuant KV 1-bit: attention (XOR + popcount Hamming)
694+ *
695+ * Ultra-fast attention using bitwise operations:
696+ * 1. RHT(query) computed ONCE
697+ * 2. Extract query sign bits ONCE
698+ * 3. Per key: XOR + popcount -> Hamming distance -> score
699+ *
700+ * The inner product estimator:
701+ * <q, k> ~ q_norm * k_norm * sqrt(pi/2) / dim * (2*agree - dim)
702+ * where agree = dim - hamming_distance(q_signs, k_signs).
703+ *
704+ * NEON vectorization for popcount with scalar fallback.
705+ * ============================================================ */
706+
707+ void tq_turbo_kv_1b_attention_ref (const float * query , const void * kv_cache ,
708+ float * scores , int seq_len , int head_dim ) {
709+ const block_tq_turbo_kv_1b * blocks = (const block_tq_turbo_kv_1b * )kv_cache ;
710+ int dim = head_dim ;
711+ if (dim > TQ_BK ) dim = TQ_BK ;
712+
713+ float scale_factor = sqrtf (TQ_PI_2 ) / (float )dim ;
714+
715+ /* Step 1: RHT(query) computed ONCE */
716+ float q_rot [TQ_BK ];
717+ memcpy (q_rot , query , (size_t )dim * sizeof (float ));
718+ for (int i = dim ; i < TQ_BK ; i ++ ) q_rot [i ] = 0.0f ;
719+ tq_rht_transform (q_rot , dim , TKV_DEFAULT_SEED );
720+
721+ /* Step 2: Compute query L2 norm */
722+ float q_norm_sq = 0.0f ;
723+ for (int i = 0 ; i < dim ; i ++ ) {
724+ q_norm_sq += query [i ] * query [i ];
725+ }
726+ float q_norm = sqrtf (q_norm_sq );
727+
728+ /* Step 3: Extract query sign bits */
729+ int sign_bytes = dim / 8 ;
730+ uint8_t q_signs [TQ_BK / 8 ];
731+ memset (q_signs , 0 , (size_t )sign_bytes );
732+ for (int i = 0 ; i < dim ; i ++ ) {
733+ if (q_rot [i ] >= 0.0f ) {
734+ q_signs [i / 8 ] |= (uint8_t )(1 << (i % 8 ));
735+ }
736+ }
737+
738+ /* Step 4: Per-key Hamming attention */
739+ for (int seq = 0 ; seq < seq_len ; seq ++ ) {
740+ const block_tq_turbo_kv_1b * blk = & blocks [seq ];
741+ float k_norm = tkv_fp16_to_fp32 (blk -> norm );
742+
743+ /* XOR + popcount to get Hamming distance */
744+ int hamming = 0 ;
745+ #ifdef __ARM_NEON
746+ if (sign_bytes == 16 ) {
747+ /* Optimized path for dim=128 (16 sign bytes) */
748+ uint8x16_t vq = vld1q_u8 (q_signs );
749+ uint8x16_t vk = vld1q_u8 (blk -> signs );
750+ uint8x16_t vxor = veorq_u8 (vq , vk );
751+ /* Count bits: use NEON vcntq_u8 for byte-level popcount */
752+ uint8x16_t vcnt = vcntq_u8 (vxor );
753+ /* Horizontal sum of all byte popcounts */
754+ hamming = vaddlvq_u8 (vcnt );
755+ } else {
756+ for (int b = 0 ; b < sign_bytes ; b ++ ) {
757+ uint8_t xor_byte = q_signs [b ] ^ blk -> signs [b ];
758+ hamming += __builtin_popcount (xor_byte );
759+ }
760+ }
761+ #else
762+ for (int b = 0 ; b < sign_bytes ; b ++ ) {
763+ uint8_t xor_byte = q_signs [b ] ^ blk -> signs [b ];
764+ /* Portable popcount using Kernighan's bit trick */
765+ int c = 0 ;
766+ while (xor_byte ) { c ++ ; xor_byte &= xor_byte - 1 ; }
767+ hamming += c ;
768+ }
769+ #endif
770+
771+ int agree = dim - hamming ;
772+ float score = q_norm * k_norm * scale_factor * (float )(2 * agree - dim );
773+ scores [seq ] = score ;
774+ }
775+ }
0 commit comments