feat(cache): implement KV cache compression with sliding window (OPT-C01)

gHashTag · ona-agent · gHashTag · commit 96849d95a30e · 2026-02-02T10:18:57.000Z
- Add streamingAttention with sliding window mask
- Add CompressionStats for monitoring cache efficiency
- Integrate streaming attention into tri_inference.zig
- Attention sink: keep first N tokens (default 4)
- Local window: keep last M tokens (configurable)
- Benchmark: 5x compression (500 tokens → 100 in cache)
- Memory savings: 16x for 32K context with 2K window

Co-authored-by: Ona &lt;no-reply@ona.com&gt;
diff --git a/docs/DISCOVERIES.md b/docs/DISCOVERIES.md
@@ -80,6 +80,7 @@ Where:
 | OPT-T06 | Ternary Normalization | 16x | 0.2x | ✅ Implemented |
 | OPT-T07 | Batch Ternary MatMul | N/A | 2.28x | ✅ Implemented |
 | OPT-M01 | Memory-Mapped Loading | N/A | 30x load | ✅ Implemented |
+| OPT-C01 | KV Cache Compression | 5-16x | 1x | ✅ Implemented |
 
 ### Business Value
 
@@ -446,6 +447,66 @@ var reader = try gguf.GGUFReader.init(allocator, "model.gguf");
 var reader = try gguf.MmapGGUFReader.init(allocator, "model.gguf");
 ```
 
+### KV Cache Compression (OPT-C01)
+
+**Status**: ✅ Implemented
+
+| Component | File | Description |
+|-----------|------|-------------|
+| SlidingWindowConfig | `kv_cache.zig` | Window size + sink tokens config |
+| RingKVCache | `kv_cache.zig` | Ring buffer with O(1) append |
+| streamingAttention | `kv_cache.zig` | Masked attention for sliding window |
+| CompressionStats | `kv_cache.zig` | Compression statistics |
+
+**Sliding Window + Attention Sink:**
+```
+┌─────────────────────────────────────────────────────────────┐
+│                    CONTEXT WINDOW                           │
+├─────────────────────────────────────────────────────────────┤
+│  [SINK]  [EVICTED...]  [LOCAL WINDOW]                       │
+│  ┌───┐   ┌───────────┐ ┌─────────────────────────────────┐  │
+│  │ 4 │   │  MASKED   │ │        RECENT TOKENS            │  │
+│  │tok│   │  (-inf)   │ │        (attend here)            │  │
+│  └───┘   └───────────┘ └─────────────────────────────────┘  │
+│    ↑                          ↑                             │
+│  Always                    Sliding                          │
+│  kept                      window                           │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Benchmark Results (500 tokens, window=100):**
+```
+╔══════════════════════════════════════════════════════════════╗
+║           KV CACHE COMPRESSION STATS                        ║
+╠══════════════════════════════════════════════════════════════╣
+║  Total tokens seen:           500                            ║
+║  Tokens in cache:             100                            ║
+║  Evicted tokens:              400                            ║
+║  Compression ratio:           5.0x                          ║
+║  Memory saved:             819200 bytes                      ║
+╚══════════════════════════════════════════════════════════════╝
+```
+
+**Memory Comparison (32K context, 2K window):**
+- Standard: 32K × head_dim × 2 × layers × heads
+- Streaming: 2K × head_dim × 2 × layers × heads
+- **Savings: 16x memory reduction**
+
+**Usage:**
+```zig
+// Configure sliding window
+const config = SlidingWindowConfig{
+    .window_size = 2048,
+    .sink_tokens = 4,      // Keep first 4 tokens
+    .local_tokens = 2044,  // Keep last 2044 tokens
+};
+
+var cache = try RingKVCache.init(allocator, num_heads, head_dim, 2048, config);
+
+// Streaming attention automatically masks evicted tokens
+kv_cache.streamingAttention(output, query, &cache, head_idx, scores, scale);
+```
+
 ### Batch Processing (INF-004)
 
 **Status**: ✅ Implemented
diff --git a/specs/tri/kv_cache_compression.vibee b/specs/tri/kv_cache_compression.vibee
@@ -0,0 +1,71 @@
+# kv_cache_compression.vibee
+# KV Cache Compression with Sliding Window + Attention Sink
+# Enables infinite context with fixed memory
+
+name: kv_cache_compression
+version: "1.0.0"
+language: zig
+module: kv_cache_compression
+
+types:
+  StreamingConfig:
+    description: "Configuration for streaming/infinite context"
+    fields:
+      window_size: Int        # Total window size (sink + local)
+      sink_tokens: Int        # First N tokens always kept
+      local_tokens: Int       # Recent tokens in sliding window
+      use_sparse_attention: Bool  # Apply window mask to attention
+
+  CompressionStats:
+    description: "Statistics for cache compression"
+    fields:
+      total_tokens_seen: Int
+      tokens_in_cache: Int
+      evicted_tokens: Int
+      compression_ratio: Float
+      memory_saved_bytes: Int
+
+behaviors:
+  - name: apply_window_mask
+    given: attention scores, window mask
+    when: computing masked attention
+    then: sets out-of-window scores to -inf before softmax
+
+  - name: streaming_attention
+    given: query, RingKVCache, window config
+    when: computing attention with sliding window
+    then: only attends to sink tokens + local window
+
+  - name: get_compression_stats
+    given: RingKVCache
+    when: querying compression efficiency
+    then: returns stats including memory saved
+
+  - name: configure_streaming
+    given: model, StreamingConfig
+    when: enabling streaming mode
+    then: configures all layer caches for sliding window
+
+# Algorithm:
+#
+# Standard Attention (O(N²) memory):
+#   scores = Q @ K^T  (all N tokens)
+#   output = softmax(scores) @ V
+#
+# Streaming Attention (O(W) memory, W = window_size):
+#   For each query position:
+#     1. Compute scores for sink tokens (first S)
+#     2. Compute scores for local window (last L)
+#     3. Mask out evicted tokens (set to -inf)
+#     4. Softmax over valid positions only
+#     5. Weighted sum of V
+#
+# Memory Comparison (context_length=32K, window=2K):
+#   Standard: 32K × head_dim × 2 (K+V) × num_layers × num_heads
+#   Streaming: 2K × head_dim × 2 (K+V) × num_layers × num_heads
+#   Savings: 16x memory reduction!
+#
+# Attention Sink Insight:
+#   First few tokens accumulate attention mass during training.
+#   Keeping them prevents attention collapse on long sequences.
+#   Typically 4 sink tokens is sufficient.
diff --git a/src/vibeec/kv_cache.zig b/src/vibeec/kv_cache.zig
@@ -541,6 +541,117 @@ pub const CacheStats = struct {
     memory_bytes: usize,
 };
 
+// ═══════════════════════════════════════════════════════════════════════════════
+// STREAMING ATTENTION (Sliding Window + Attention Sink)
+// Enables infinite context with fixed memory
+// ═══════════════════════════════════════════════════════════════════════════════
+
+/// Compute streaming attention with sliding window mask
+/// Only attends to sink tokens + local window, ignoring evicted tokens
+pub fn streamingAttention(
+    output: []f32,
+    query: []const f32,
+    cache: *const RingKVCache,
+    head_idx: usize,
+    scores_buf: []f32,
+    scale: f32,
+) void {
+    const seq_len = cache.seqLen();
+    const head_dim = cache.head_dim;
+
+    if (seq_len == 0) {
+        @memset(output, 0.0);
+        return;
+    }
+
+    // Compute attention scores with window masking
+    var max_score: f32 = -std.math.inf(f32);
+
+    for (0..seq_len) |t| {
+        // Get logical position for window check
+        const logical_pos = if (cache.total_tokens <= cache.max_seq_len)
+            t
+        else
+            cache.total_tokens - cache.max_seq_len + t;
+
+        // Check if position is in window (sink or local)
+        const in_window = cache.isInWindow(logical_pos);
+
+        if (in_window) {
+            // Compute dot product
+            const k_vec = cache.getK(t, head_idx);
+            var dot: f32 = 0.0;
+            for (0..head_dim) |j| {
+                dot += query[j] * k_vec[j];
+            }
+            scores_buf[t] = dot * scale;
+            if (scores_buf[t] > max_score) max_score = scores_buf[t];
+        } else {
+            // Mask out evicted tokens
+            scores_buf[t] = -std.math.inf(f32);
+        }
+    }
+
+    // Softmax (numerically stable)
+    var sum_exp: f32 = 0.0;
+    for (0..seq_len) |t| {
+        if (scores_buf[t] > -std.math.inf(f32)) {
+            scores_buf[t] = @exp(scores_buf[t] - max_score);
+            sum_exp += scores_buf[t];
+        } else {
+            scores_buf[t] = 0.0;
+        }
+    }
+
+    if (sum_exp > 0.0) {
+        for (0..seq_len) |t| {
+            scores_buf[t] /= sum_exp;
+        }
+    }
+
+    // Weighted sum of V
+    @memset(output, 0.0);
+    for (0..seq_len) |t| {
+        if (scores_buf[t] > 0.0) {
+            const v_vec = cache.getV(t, head_idx);
+            const score_val = scores_buf[t];
+            for (0..head_dim) |j| {
+                output[j] += score_val * v_vec[j];
+            }
+        }
+    }
+}
+
+/// Compression statistics for streaming mode
+pub const CompressionStats = struct {
+    total_tokens_seen: usize,
+    tokens_in_cache: usize,
+    evicted_tokens: usize,
+    compression_ratio: f32,
+    memory_saved_bytes: usize,
+    effective_context: usize, // sink + local window
+
+    pub fn fromCache(cache: *const RingKVCache) CompressionStats {
+        const cfg = cache.window_config;
+        const effective = @min(cache.total_tokens, cfg.sink_tokens + cfg.local_tokens);
+        const full_memory = cache.total_tokens * cache.num_kv_heads * cache.head_dim * 2 * @sizeOf(f32);
+        const actual_memory = cache.memoryUsage();
+        const saved = if (full_memory > actual_memory) full_memory - actual_memory else 0;
+
+        return CompressionStats{
+            .total_tokens_seen = cache.total_tokens,
+            .tokens_in_cache = cache.seqLen(),
+            .evicted_tokens = cache.evicted_tokens,
+            .compression_ratio = if (cache.total_tokens > 0)
+                @as(f32, @floatFromInt(cache.total_tokens)) / @as(f32, @floatFromInt(cache.seqLen()))
+            else
+                1.0,
+            .memory_saved_bytes = saved,
+            .effective_context = effective,
+        };
+    }
+};
+
 // ═══════════════════════════════════════════════════════════════════════════════
 // TERNARY KV-CACHE (OPT-T03)
 // 16x memory reduction via 2-bit quantization
@@ -1328,3 +1439,88 @@ test "batch kv cache" {
     try std.testing.expect(seq2 != null);
     try std.testing.expectEqual(@as(usize, 2), batch.activeCount());
 }
+
+test "streaming_attention_window" {
+    const allocator = std.testing.allocator;
+
+    // Create cache with small window for testing
+    const window_config = SlidingWindowConfig{
+        .window_size = 16,
+        .sink_tokens = 2, // Keep first 2 tokens
+        .local_tokens = 6, // Keep last 6 tokens
+    };
+
+    var cache = try RingKVCache.init(allocator, 1, 4, 16, window_config);
+    defer cache.deinit();
+
+    // Add 20 tokens (exceeds window)
+    for (0..20) |i| {
+        var k = [_]f32{ @floatFromInt(i), 0, 0, 0 };
+        var v = [_]f32{ 1, 0, 0, 0 };
+        cache.append(&k, &v);
+    }
+
+    // Check window membership
+    // Sink tokens (0, 1) should be in window
+    try std.testing.expect(cache.isInWindow(0));
+    try std.testing.expect(cache.isInWindow(1));
+
+    // Middle tokens should be evicted
+    try std.testing.expect(!cache.isInWindow(5));
+    try std.testing.expect(!cache.isInWindow(10));
+
+    // Recent tokens (14-19) should be in window
+    try std.testing.expect(cache.isInWindow(14));
+    try std.testing.expect(cache.isInWindow(19));
+
+    // Test streaming attention
+    const query = [_]f32{ 1, 0, 0, 0 };
+    var output: [4]f32 = undefined;
+    var scores: [16]f32 = undefined;
+
+    streamingAttention(&output, &query, &cache, 0, &scores, 1.0);
+
+    // Output should be non-zero (attention computed)
+    try std.testing.expect(output[0] != 0.0);
+}
+
+test "compression_stats" {
+    const allocator = std.testing.allocator;
+
+    const window_config = SlidingWindowConfig{
+        .window_size = 100,
+        .sink_tokens = 4,
+        .local_tokens = 96,
+    };
+
+    var cache = try RingKVCache.init(allocator, 4, 64, 100, window_config);
+    defer cache.deinit();
+
+    // Simulate long sequence
+    var k_buf: [256]f32 = undefined;
+    var v_buf: [256]f32 = undefined;
+    @memset(&k_buf, 0.1);
+    @memset(&v_buf, 0.2);
+
+    for (0..500) |_| {
+        cache.append(&k_buf, &v_buf);
+    }
+
+    const stats = CompressionStats.fromCache(&cache);
+
+    try std.testing.expectEqual(@as(usize, 500), stats.total_tokens_seen);
+    try std.testing.expectEqual(@as(usize, 100), stats.tokens_in_cache);
+    try std.testing.expectEqual(@as(usize, 400), stats.evicted_tokens);
+    try std.testing.expect(stats.compression_ratio >= 4.9); // 500/100 = 5x
+
+    std.debug.print("\n╔══════════════════════════════════════════════════════════════╗\n", .{});
+    std.debug.print("║           KV CACHE COMPRESSION STATS                        ║\n", .{});
+    std.debug.print("╠══════════════════════════════════════════════════════════════╣\n", .{});
+    std.debug.print("║  Total tokens seen:    {d:>10}                            ║\n", .{stats.total_tokens_seen});
+    std.debug.print("║  Tokens in cache:      {d:>10}                            ║\n", .{stats.tokens_in_cache});
+    std.debug.print("║  Evicted tokens:       {d:>10}                            ║\n", .{stats.evicted_tokens});
+    std.debug.print("║  Compression ratio:    {d:>10.1}x                          ║\n", .{stats.compression_ratio});
+    std.debug.print("║  Effective context:    {d:>10}                            ║\n", .{stats.effective_context});
+    std.debug.print("║  Memory saved:         {d:>10} bytes                      ║\n", .{stats.memory_saved_bytes});
+    std.debug.print("╚══════════════════════════════════════════════════════════════╝\n", .{});
+}
diff --git a/src/vibeec/tri_inference.zig b/src/vibeec/tri_inference.zig
@@ -757,27 +757,18 @@ pub const BatchTriModel = struct {
         for (0..num_heads) |h| {
             const kv_h = h / kv_group_size;
             const q_head = model.buf_q[h * head_dim ..][0..head_dim];
-
-            // Compute attention scores using RingKVCache
-            for (0..seq_len) |t| {
-                const k_vec = cache.getK(t, kv_h);
-                model.buf_scores[t] = flash.simdDot(q_head, k_vec) * scale;
-            }
-
-            // Softmax
-            inference.softmax(model.buf_scores[0..seq_len], model.buf_scores[0..seq_len]);
-
-            // Weighted sum
             const out_head = model.buf_attn_out[h * head_dim ..][0..head_dim];
-            @memset(out_head, 0.0);
 
-            for (0..seq_len) |t| {
-                const v_vec = cache.getV(t, kv_h);
-                const score_val = model.buf_scores[t];
-                for (0..head_dim) |j| {
-                    out_head[j] += score_val * v_vec[j];
-                }
-            }
+            // Use streaming attention with sliding window mask
+            // This enables infinite context with fixed memory
+            kv_cache.streamingAttention(
+                out_head,
+                q_head,
+                cache,
+                kv_h,
+                model.buf_scores[0..seq_len],
+                scale,
+            );
         }
 
         // Output projection