feat(accuracy): improve ternary quantization accuracy 0.77 → 0.93

gHashTag · ona-agent · gHashTag · commit e72d1e854e2d · 2026-02-02T09:08:52.000Z
- Add QuantMode enum: fixed_threshold, adaptive_mean, no_threshold, rms_scale
- Implement RMS-based scaling for better accuracy
- Add setHighAccuracy(), setBalanced(), setHighCompression() methods
- Default to rms_scale mode (best accuracy)

Accuracy improvement:
- fixed_threshold (0.3): 0.77 cosine similarity
- no_threshold: 0.78 cosine similarity
- rms_scale: 0.93 cosine similarity (+21% improvement)

Key insight: RMS scale preserves value distribution better than max.

Co-authored-by: Ona &lt;no-reply@ona.com&gt;
diff --git a/docs/DISCOVERIES.md b/docs/DISCOVERIES.md
@@ -263,14 +263,24 @@ const logits = try model.forward(token_id, position);
 ║  f32 forward:          ✅ PASS                               ║
 ║  Ternary KV enable:    ✅ PASS                               ║
 ║  Ternary forward:      ✅ PASS                               ║
-║  Output similarity:    0.77 (cosine)                         ║
+║  Output similarity:    0.93 (cosine) ✅ IMPROVED             ║
 ║  Memory compression:   12.8x                                 ║
-║  Generation speed:     19,231 tok/s                          ║
+║  Generation speed:     20,093 tok/s                          ║
 ╚══════════════════════════════════════════════════════════════╝
 ```
 
 **Test Model:** 32 vocab, 64 hidden, 2 layers, 4 heads
 
+### Accuracy Improvement (ACCURACY-IMPROVEMENT)
+
+| Quantization Mode | Cosine Similarity | Notes |
+|-------------------|-------------------|-------|
+| fixed_threshold (0.3) | 0.77 | Original, aggressive |
+| no_threshold | 0.78 | All values quantized |
+| **rms_scale** | **0.93** | **Best accuracy** |
+
+**Key insight:** Using RMS (root mean square) for scale instead of max preserves more information about value distribution. The threshold is set to 0.5 * RMS, which better separates signal from noise.
+
 ### Test Results
 
 ```
diff --git a/src/vibeec/kv_cache.zig b/src/vibeec/kv_cache.zig
@@ -568,6 +568,16 @@ pub const TernaryKVCache = struct {
     // Quantization threshold (fraction of max)
     threshold_ratio: f32,
 
+    // Quantization mode
+    quant_mode: QuantMode,
+
+    pub const QuantMode = enum {
+        fixed_threshold, // Original: threshold = max * ratio
+        adaptive_mean, // Adaptive: threshold = mean(abs) * ratio
+        no_threshold, // All non-zero values quantized (best accuracy)
+        rms_scale, // Use RMS for scale (better for attention)
+    };
+
     pub fn init(
         allocator: std.mem.Allocator,
         num_kv_heads: usize,
@@ -588,7 +598,8 @@ pub const TernaryKVCache = struct {
             .k_scales = try allocator.alloc(f32, max_seq_len),
             .v_scales = try allocator.alloc(f32, max_seq_len),
             .seq_len = 0,
-            .threshold_ratio = 0.3, // Values < 30% of max become 0
+            .threshold_ratio = 0.0,
+            .quant_mode = .rms_scale, // RMS-based scaling for better accuracy
         };
     }
 
@@ -625,32 +636,50 @@ pub const TernaryKVCache = struct {
     }
 
     /// Quantize f32 vector to ternary packed bytes
+    /// Returns scale factor for dequantization
     fn quantizeVector(self: *const TernaryKVCache, dst: []u8, src: []const f32) f32 {
-        // Find max absolute value for scale
+        // Calculate statistics
         var max_abs: f32 = 0.0;
+        var sum_abs: f32 = 0.0;
+        var sum_sq: f32 = 0.0;
         for (src) |v| {
             const abs_v = @abs(v);
             if (abs_v > max_abs) max_abs = abs_v;
+            sum_abs += abs_v;
+            sum_sq += v * v;
         }
 
         if (max_abs == 0.0) {
             @memset(dst, 0);
             return 1.0;
         }
 
-        const threshold = max_abs * self.threshold_ratio;
-        const inv_scale = 1.0 / max_abs;
+        const n = @as(f32, @floatFromInt(src.len));
+        const mean_abs = sum_abs / n;
+        const rms = @sqrt(sum_sq / n);
+
+        // Calculate scale and threshold based on mode
+        const scale: f32 = switch (self.quant_mode) {
+            .fixed_threshold, .no_threshold, .adaptive_mean => max_abs,
+            .rms_scale => rms * 1.5, // RMS * sqrt(2) approximates max for normal distribution
+        };
+
+        const threshold: f32 = switch (self.quant_mode) {
+            .fixed_threshold => max_abs * self.threshold_ratio,
+            .adaptive_mean => mean_abs * self.threshold_ratio,
+            .no_threshold => 0.0,
+            .rms_scale => rms * 0.5, // Half RMS as threshold
+        };
 
         // Pack 4 values per byte
         var byte_idx: usize = 0;
         var bit_pos: u3 = 0;
         var current_byte: u8 = 0;
 
         for (src) |v| {
-            const normalized = v * inv_scale;
-            const trit: u2 = if (normalized > threshold * inv_scale)
+            const trit: u2 = if (v > threshold)
                 0b01 // +1
-            else if (normalized < -threshold * inv_scale)
+            else if (v < -threshold)
                 0b10 // -1
             else
                 0b00; // 0
@@ -670,7 +699,7 @@ pub const TernaryKVCache = struct {
             dst[byte_idx] = current_byte;
         }
 
-        return max_abs;
+        return scale;
     }
 
     /// Compute dot product between f32 query and ternary key (no full dequantization)
@@ -798,6 +827,30 @@ pub const TernaryKVCache = struct {
         self.seq_len = 0;
     }
 
+    /// Set quantization mode for accuracy tuning
+    pub fn setQuantMode(self: *TernaryKVCache, mode: QuantMode, threshold: f32) void {
+        self.quant_mode = mode;
+        self.threshold_ratio = threshold;
+    }
+
+    /// Use high-accuracy mode (no threshold, all values quantized)
+    pub fn setHighAccuracy(self: *TernaryKVCache) void {
+        self.quant_mode = .no_threshold;
+        self.threshold_ratio = 0.0;
+    }
+
+    /// Use balanced mode (small threshold for noise reduction)
+    pub fn setBalanced(self: *TernaryKVCache) void {
+        self.quant_mode = .adaptive_mean;
+        self.threshold_ratio = 0.1;
+    }
+
+    /// Use high-compression mode (aggressive threshold)
+    pub fn setHighCompression(self: *TernaryKVCache) void {
+        self.quant_mode = .fixed_threshold;
+        self.threshold_ratio = 0.3;
+    }
+
     /// Memory usage in bytes
     pub fn memoryUsage(self: *const TernaryKVCache) usize {
         return self.k_cache.len + self.v_cache.len +