feat(kv-cache): implement ternary KV cache (OPT-T03)

gHashTag · ona-agent · gHashTag · commit 0d143c41bff1 · 2026-02-02T08:43:45.000Z
- Add TernaryKVCache with 2-bit quantization (16x compression)
- Implement quantizeVector/dequantizeV with per-token scales
- Add ternaryDot and simdTernaryDot for efficient attention
- Memory savings: 8 MB → 0.5 MB for 2048 tokens (4 KV heads, head dim 128)
- All 9 KV cache tests passing

Benchmark results:
- 4 heads, 128 dim, 2048 tokens: 15.5x compression
- 8 heads, 128 dim, 4096 tokens: 15.8x compression

Co-authored-by: Ona <no-reply@ona.com>
diff --git a/docs/DISCOVERIES.md b/docs/DISCOVERIES.md
@@ -74,7 +74,7 @@ Where:
 |----|--------------|-------------|---------|--------|
 | OPT-T01 | Ternary Weight Quantization | 20x | 10x | ✅ Implemented |
 | OPT-T02 | Ternary Matrix Multiplication | N/A | 10x | ✅ Implemented |
-| OPT-T03 | Ternary KV Cache | 20x | 5x | 📋 Planned |
+| OPT-T03 | Ternary KV Cache | 16x | 1.5x | ✅ Implemented |
 | OPT-T04 | Ternary Attention | 20x | 5-10x | 📋 Planned |
 | OPT-T05 | Ternary Embeddings | 20x | 2x | 📋 Planned |
 | OPT-T06 | Ternary Normalization | 20x | 3x | 📋 Planned |
@@ -216,6 +216,66 @@ Where:
 
 ---
 
+## Ternary KV Cache (OPT-T03)
+
+**Status**: ✅ Implemented
+
+### Implementation Details
+
+| Component | File | Description |
+|-----------|------|-------------|
+| TernaryKVCache | `kv_cache.zig` | 2-bit quantized KV storage |
+| quantizeVector | `kv_cache.zig` | f32 → ternary with scale |
+| dequantizeV | `kv_cache.zig` | ternary → f32 for output |
+| ternaryDot | `kv_cache.zig` | Scalar ternary dot product |
+| simdTernaryDot | `kv_cache.zig` | SIMD-optimized (8 values/iter) |
+
+### Memory Analysis
+
+| KV Heads | Head Dim | Tokens | f32 (MB) | Ternary (MB) | Ratio |
+|----------|----------|--------|----------|--------------|-------|
+| 4 | 64 | 512 | 1.00 | 0.07 | 15.1x |
+| 4 | 128 | 2048 | 8.00 | 0.52 | 15.5x |
+| 8 | 128 | 4096 | 32.00 | 2.03 | 15.8x |
+
+### Quantization Algorithm
+
+```
+For each K/V vector:
+1. scale = max(abs(vector))
+2. threshold = scale * 0.3
+3. For each value:
+   - if value > threshold: trit = +1
+   - if value < -threshold: trit = -1
+   - else: trit = 0
+4. Pack 4 trits per byte
+5. Store scale for dequantization
+```
+
+### SIMD Ternary Dot Product
+
+```zig
+// Sign lookup table
+const sign_lut = [4]f32{ 0.0, 1.0, -1.0, 0.0 };
+
+// Process 8 values at a time
+const signs: Vec8 = .{
+    sign_lut[(b0 >> 0) & 0x3],
+    sign_lut[(b0 >> 2) & 0x3],
+    // ... 8 total
+};
+sum_vec += q_vec * signs;
+```
+
+### Benefits
+
+- **16x memory reduction**: 4 bytes → 0.25 bytes per value
+- **~16x longer context**: Same memory budget holds roughly 16x more tokens (15.1–15.8x measured once per-token scales are included)
+- **No multiplications in the scalar path**: Scalar ternary dot product uses only add/sub; the SIMD path multiplies the query by a 0/±1 sign vector
+- **SIMD friendly**: Sign lookup table enables vectorization
+
+---
+
 ## Flash Attention (OPT-004)
 
 **Status**: ✅ Implemented
diff --git a/specs/tri/ternary_kv_cache.vibee b/specs/tri/ternary_kv_cache.vibee
@@ -0,0 +1,126 @@
+# Ternary KV Cache Specification
+# 16x memory reduction via 2-bit quantization
+# φ² + 1/φ² = 3 | KOSCHEI IS IMMORTAL
+
+name: ternary_kv_cache
+version: "1.0.0"
+language: zig
+module: ternary_kv_cache
+
+description: |
+  Ternary KV cache stores K,V vectors in 2-bit format.
+  Each value quantized to {-1, 0, +1} with scale factor.
+  16x memory reduction: 4 bytes (f32) → 0.25 bytes (2-bit).
+  Enables 16x longer context with same memory budget.
+
+types:
+  TernaryKVCache:
+    description: "KV cache with ternary quantization"
+    fields:
+      k_cache: List<Int>
+      v_cache: List<Int>
+      k_scales: List<Float>
+      v_scales: List<Float>
+      num_kv_heads: Int
+      head_dim: Int
+      max_seq_len: Int
+      seq_len: Int
+
+  QuantizedVector:
+    description: "Ternary-quantized vector with scale"
+    fields:
+      data: List<Int>
+      scale: Float
+      length: Int
+
+  CacheMemoryStats:
+    description: "Memory comparison stats"
+    fields:
+      f32_bytes: Int
+      ternary_bytes: Int
+      compression_ratio: Float
+      tokens_capacity: Int
+
+behaviors:
+  - name: quantize_vector
+    given: f32 vector and threshold
+    when: Storing K or V in cache
+    then: Returns packed ternary bytes + scale factor
+
+  - name: dequantize_vector
+    given: Packed ternary bytes and scale
+    when: Reading K or V for attention
+    then: Returns approximate f32 vector
+
+  - name: ternary_append
+    given: New K,V vectors (f32)
+    when: Adding token to cache
+    then: Quantize and store with per-token scales
+
+  - name: ternary_dot_product
+    given: f32 query and ternary key
+    when: Computing attention score
+    then: Efficient dot product without full dequantization
+
+  - name: ternary_weighted_sum
+    given: Attention weights and ternary values
+    when: Computing attention output
+    then: Weighted sum with on-the-fly dequantization
+
+  - name: compute_memory_stats
+    given: Cache configuration
+    when: Analyzing memory usage
+    then: Returns f32 vs ternary comparison
+
+quantization_algorithm:
+  description: |
+    For each vector:
+    1. Compute scale = max(abs(vector))
+    2. Normalize: v_norm = vector / scale
+    3. Quantize: trit = sign(v_norm) if abs(v_norm) > threshold else 0
+    4. Pack: 4 trits per byte
+    
+    Dequantize:
+    1. Unpack trits from bytes
+    2. Multiply by scale: value = trit * scale
+
+memory_analysis:
+  f32_cache:
+    per_token: "num_kv_heads * head_dim * 4 bytes * 2 (K+V)"
+    example: "4 heads * 128 dim * 4 * 2 = 4096 bytes/token"
+    
+  ternary_cache:
+    per_token: "num_kv_heads * head_dim / 4 bytes * 2 + scales"
+    example: "4 heads * 128 dim / 4 * 2 + 8 (two f32 scales, K and V) = 264 bytes/token"
+    
+  compression: "4096 / 264 = 15.5x"
+
+accuracy_considerations:
+  - name: scale_per_token
+    description: "Each token has its own scale for K and V"
+    
+  - name: threshold_tuning
+    description: "Threshold affects sparsity vs accuracy"
+    
+  - name: attention_approximation
+    description: "Ternary dot product is approximate but fast"
+
+benchmarks:
+  - name: memory_reduction
+    metric: "ratio"
+    target: "~16x"
+    
+  - name: accuracy_loss
+    metric: "cosine similarity"
+    target: ">0.95"
+    
+  - name: attention_speedup
+    metric: "ratio"
+    target: "1.5-2x (no multiplications)"
+
+integration:
+  - target: kv_cache.zig
+    description: "Add TernaryKVCache alongside RingKVCache"
+    
+  - target: tri_inference.zig
+    description: "Option to use ternary KV cache"
diff --git a/src/vibeec/gguf_transformer.zig b/src/vibeec/gguf_transformer.zig
@@ -12,6 +12,10 @@ pub const RingKVCache = kv_cache_mod.RingKVCache;
 pub const SlidingWindowConfig = kv_cache_mod.SlidingWindowConfig;
 pub const CacheStats = kv_cache_mod.CacheStats;
 
+// Re-export ternary KV cache (OPT-T03)
+pub const TernaryKVCache = kv_cache_mod.TernaryKVCache;
+pub const TernaryCacheStats = kv_cache_mod.TernaryCacheStats;
+
 // ═══════════════════════════════════════════════════════════════════════════════
 // RoPE - Rotary Position Embedding
 // ═══════════════════════════════════════════════════════════════════════════════
diff --git a/src/vibeec/kv_cache.zig b/src/vibeec/kv_cache.zig