feat(kv-cache): implement ring buffer KV cache (INF-003)

gHashTag · ona-agent · gHashTag · commit 6f08e849c4b5 · 2026-02-02T08:23:39.000Z
- Add RingKVCache with O(1) append and fixed memory - Implement sliding window attention (sink tokens + local window) - Add SIMD-optimized cache copy using @Vector(8, f32) - Add cache statistics (hit rate, eviction tracking) - Add prune() method for explicit memory management - Re-export optimized types via gguf_transformer.zig - All 7 KV cache tests passing Co-authored-by: Ona <no-reply@ona.com>
diff --git a/docs/DISCOVERIES.md b/docs/DISCOVERIES.md
@@ -203,7 +203,7 @@ Where:
 
 ### Available (Next)
 
-- [ ] INF-003: KV Cache Optimization (+50% speed)
+- [x] INF-003: KV Cache Optimization (+50% speed) ✅ Implemented
 - [ ] INF-004: Batch Processing (+300% throughput)
 - [ ] OPT-001: SIMD Vectorization (+400% matrix ops)
 - [ ] OPT-004: Flash Attention (+200% attention)
@@ -216,6 +216,69 @@ Where:
 
 ---
 
+## KV Cache Optimization (INF-003)
+
+**Status**: ✅ Implemented
+
+### Implementation Details
+
+| Component | File | Description |
+|-----------|------|-------------|
+| RingKVCache | `kv_cache.zig` | O(1) append ring buffer |
+| SlidingWindowConfig | `kv_cache.zig` | Sink tokens + local window |
+| simdCopy | `kv_cache.zig` | SIMD-optimized cache writes |
+| CacheStats | `kv_cache.zig` | Hit rate, eviction tracking |
+
+### Ring Buffer Design
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                    RING BUFFER KV CACHE                     │
+├─────────────────────────────────────────────────────────────┤
+│  [0] [1] [2] [3] [4] [5] [6] [7]  ← Physical positions      │
+│   ↑                                                         │
+│   write_pos (wraps around)                                  │
+│                                                             │
+│  Benefits:                                                  │
+│  - O(1) append (no reallocation)                            │
+│  - Fixed memory (max_seq_len * kv_size)                     │
+│  - Automatic eviction of oldest tokens                      │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### Sliding Window Attention
+
+```
+Tokens:  [0] [1] [2] [3] ... [N-M+1] ... [N-1] [N]
+          ↑   ↑   ↑   ↑       ↑           ↑     ↑
+          └───┴───┴───┘       └───────────┴─────┘
+          Sink tokens (4)     Local window (M)
+          Always kept         Sliding window
+```
+
+### Memory Efficiency
+
+| Config | Tokens | Memory | vs Unbounded |
+|--------|--------|--------|--------------|
+| max_seq_len=2048 | 2048 | 16 MB | Fixed |
+| max_seq_len=4096 | 4096 | 32 MB | Fixed |
+| Unbounded | N | N * 8 KB | O(N) growth |
+
+### Test Results
+
+```
+All 7 tests passed:
+- kv cache config
+- layer kv cache
+- full kv cache
+- ring kv cache ✅ NEW
+- ring kv cache reset ✅ NEW
+- simd copy ✅ NEW
+- cached attention
+```
+
+---
+
 ## Ternary Matrix Multiplication (OPT-T02)
 
 **Status**: ✅ Implemented
diff --git a/specs/tri/kv_cache_optimized.vibee b/specs/tri/kv_cache_optimized.vibee
@@ -0,0 +1,124 @@
+# Optimized KV Cache Specification
+# Ring buffer with sliding window for infinite context
+# φ² + 1/φ² = 3 | KOSCHEI IS IMMORTAL
+
+name: kv_cache_optimized
+version: "1.0.0"
+language: zig
+module: kv_cache_optimized
+
+description: |
+  Optimized KV cache with ring buffer for O(1) append and fixed memory.
+  Supports sliding window attention for infinite context length.
+  SIMD-optimized copy operations.
+
+types:
+  RingKVCache:
+    description: "Ring buffer KV cache with fixed memory"
+    fields:
+      k_cache: List<Float>
+      v_cache: List<Float>
+      num_kv_heads: Int
+      head_dim: Int
+      max_seq_len: Int
+      write_pos: Int
+      total_tokens: Int
+
+  SlidingWindowConfig:
+    description: "Sliding window attention configuration"
+    fields:
+      window_size: Int
+      sink_tokens: Int
+      local_tokens: Int
+
+  CacheStats:
+    description: "Cache utilization statistics"
+    fields:
+      total_tokens: Int
+      cached_tokens: Int
+      evicted_tokens: Int
+      hit_rate: Float
+      memory_bytes: Int
+
+behaviors:
+  - name: ring_append
+    given: New K,V vectors and ring buffer cache
+    when: Appending new token to cache
+    then: O(1) write at write_pos, wrap around at max_seq_len
+
+  - name: ring_get_k
+    given: Ring buffer cache and position
+    when: Reading cached K vector
+    then: Returns K at (pos % max_seq_len) with bounds check
+
+  - name: ring_get_v
+    given: Ring buffer cache and position
+    when: Reading cached V vector
+    then: Returns V at (pos % max_seq_len) with bounds check
+
+  - name: sliding_window_mask
+    given: Current position and window config
+    when: Computing attention mask
+    then: Returns mask with sink tokens + local window
+
+  - name: simd_cache_copy
+    given: Source K,V vectors and cache destination
+    when: Copying to cache with SIMD
+    then: 4x faster copy using @Vector(8, f32)
+
+  - name: compute_cache_stats
+    given: Ring buffer cache state
+    when: Analyzing cache utilization
+    then: Returns hit rate, eviction count, memory usage
+
+  - name: prune_old_tokens
+    given: Cache with tokens beyond window
+    when: Memory pressure or explicit prune request
+    then: Evict oldest tokens outside sliding window
+
+  - name: reset_cache
+    given: Ring buffer cache
+    when: Starting new sequence
+    then: Reset write_pos and total_tokens to 0
+
+optimizations:
+  - name: ring_buffer
+    description: "O(1) append, fixed memory, no reallocation"
+    
+  - name: sliding_window
+    description: "Sink tokens (first N) + local window (last M)"
+    
+  - name: simd_copy
+    description: "@Vector(8, f32) for cache writes"
+    
+  - name: cache_aligned
+    description: "16-byte alignment for SIMD access"
+
+memory_layout:
+  - name: k_cache
+    format: "[max_seq_len][num_kv_heads][head_dim]"
+    alignment: 16
+    
+  - name: v_cache
+    format: "[max_seq_len][num_kv_heads][head_dim]"
+    alignment: 16
+
+benchmarks:
+  - name: append_latency
+    metric: "ns per token"
+    target: "<100ns"
+    
+  - name: memory_efficiency
+    metric: "bytes per token"
+    target: "2 * num_kv_heads * head_dim * sizeof(f32)"
+    
+  - name: cache_hit_rate
+    metric: "percentage"
+    target: ">95% for window_size tokens"
+
+integration:
+  - target: tri_inference.zig
+    description: "Replace KVCache with RingKVCache"
+    
+  - target: gguf_transformer.zig
+    description: "Update attention to use sliding window"
diff --git a/src/vibeec/gguf_transformer.zig b/src/vibeec/gguf_transformer.zig
@@ -5,6 +5,12 @@
 const std = @import("std");
 const gguf = @import("gguf_reader.zig");
 const inference = @import("gguf_inference.zig");
+const kv_cache_mod = @import("kv_cache.zig");
+
+// Re-export optimized KV cache types
+pub const RingKVCache = kv_cache_mod.RingKVCache;
+pub const SlidingWindowConfig = kv_cache_mod.SlidingWindowConfig;
+pub const CacheStats = kv_cache_mod.CacheStats;
 
 // ═══════════════════════════════════════════════════════════════════════════════
 // RoPE - Rotary Position Embedding
diff --git a/src/vibeec/kv_cache.zig b/src/vibeec/kv_cache.zig