SKaiNET-developers
diff --git a/‎kllama-enterprise.md‎
Lines changed: 16 additions & 3 deletions b/‎kllama-enterprise.md‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎quant_format.md‎
Lines changed: 378 additions & 0 deletions b/‎quant_format.md‎
Lines changed: 378 additions & 0 deletions
diff --git a/‎skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/llama/LlamaWeightLoader.kt‎
Lines changed: 122 additions & 1 deletion b/‎skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/llama/LlamaWeightLoader.kt‎
Lines changed: 122 additions & 1 deletion
diff --git a/‎skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/llama/LlamaQuantDequantTest.kt‎
Lines changed: 120 additions & 0 deletions b/‎skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/llama/LlamaQuantDequantTest.kt‎
Lines changed: 120 additions & 0 deletions
@@ -29,7 +29,7 @@ KLlama is a Kotlin Multiplatform LLM inference runtime. This document outlines t
 | iOS/Android native | ✅ | ❌ | Via bindings |
 | Browser (Wasm) | ✅ | ❌ | Via bindings |
 | Quantized inference | 🚧 Planned | ✅ | ✅ |
-| **BitNet/Ternary native** | 🚧 Planned | ❌ | Partial |
+| **BitNet/Ternary native** | ✅ TQ1_0/TQ2_0 dequant + ternary matmul | ❌ | Partial |
 | SIMD optimization | Partial | ✅ | ✅ |
 | Memory-mapped I/O | ✅ (JVM) | ✅ | ✅ |
 | Multiple architectures | ❌ | ✅ | ✅ |
@@ -467,7 +467,7 @@ class MappedGGUFReader(path: Path) {
 
 **Impact**: Enable 7B, 13B, 70B models without OOM
 
-### 1.2 BitNet / Ternary Quantization Support 🆕 HIGH PRIORITY
+### 1.2 BitNet / Ternary Quantization Support ✅ IMPLEMENTED
 
 Native support for Microsoft's BitNet 1.58-bit models with ternary weights {-1, 0, +1}.
 
@@ -477,6 +477,14 @@ Native support for Microsoft's BitNet 1.58-bit models with ternary weights {-1,
 - Unique differentiator (most frameworks don't have native ternary kernels)
 - We already have `Ternary` DType and `DenseTernaryTensorArray`
 
+**What's Implemented:**
+- ✅ TQ1_0 dequantization (base-3 packed ternary format, ~1.69 bpw)
+- ✅ TQ2_0 dequantization (2-bit packed ternary format, ~2.06 bpw)
+- ✅ `Ternary2BitTensorData` - compact storage with TQ format encoding
+- ✅ `TernaryMatmul.matmul()` - addition-only kernel (no FP multiply)
+- ✅ `matmulAutoDispatch()` - automatic ternary detection and dispatch
+- ✅ Comprehensive unit tests for all components
+
 **Architecture Integration:**
 ```
 ┌─────────────────────────────────────────────────────────────────┐
@@ -542,13 +550,18 @@ fun matmulTernarySIMD(input: FloatArray, weights: TernaryTensorData): FloatArray
 }
 ```
 
-**Existing Foundation:**
+**Implementation Status:**
 | Component | Status | Location |
 |-----------|--------|----------|
 | `Ternary` DType | ✅ | `skainet-lang-core/.../types/Ternary.kt` |
 | `DenseTernaryTensorArray` | ✅ | `skainet-lang-core/.../data/dense/` |
 | GGUF TQ1_0/TQ2_0 enum | ✅ | `GGMLQuantizationType` |
 | Type promotion | ✅ | Ternary → Int8 → FP32 |
+| `dequantTQ1_0()` | ✅ | `LlamaWeightLoader.kt` |
+| `dequantTQ2_0()` | ✅ | `LlamaWeightLoader.kt` |
+| `Ternary2BitTensorData` | ✅ | `skainet-lang-core/.../data/TernaryTensorData.kt` |
+| `TernaryMatmul` | ✅ | `skainet-lang-core/.../ops/TernaryMatmul.kt` |
+| Unit tests | ✅ | `LlamaQuantDequantTest`, `TernaryTensorDataTest`, `TernaryMatmulTest` |
 
 **Impact**:
 - **Speed**: 5-10x faster than FP32 (no FP multiply, integer add only)
 
@@ -597,6 +597,123 @@ public class LlamaWeightLoader(
             }
             return out
         }
+
+        /**
+         * Dequantize TQ2_0 (Ternary 2-bit) format to FP32.
+         *
+         * TQ2_0 layout per block (256 elements, 66 bytes):
+         * - 64 bytes: quantized data (4 ternary values per byte, 2-bit each)
+         * - 2 bytes: f16 scale
+         *
+         * Values encoded as {0, 1, 2} represent {-1, 0, +1}.
+         * Dequantization: output[i] = (ternary[i] - 1) * scale
+         */
+        internal fun dequantTQ2_0(raw: List<Any>, nElems: Int): FloatArray {
+            val bytes = toByteArray(raw, "TQ2_0")
+            val blockSize = 256
+            val bytesPerBlock = 66 // 64 (qs) + 2 (f16 scale)
+            val blockCount = bytes.size / bytesPerBlock
+            val out = FloatArray(blockCount * blockSize)
+            var offset = 0
+            var outOff = 0
+
+            repeat(blockCount) {
+                // Read quantized values first (64 bytes = 256 values at 2-bit each)
+                val qs = bytes.copyOfRange(offset, offset + 64)
+                offset += 64
+
+                // Read f16 scale (last 2 bytes)
+                val scale = halfToFloat(
+                    (bytes[offset + 1].toInt() and 0xFF shl 8) or (bytes[offset].toInt() and 0xFF)
+                )
+                offset += 2
+
+                // Decode 2-bit values: 4 values per byte
+                // Bit layout: [v3:v2:v1:v0] where each vN is 2 bits
+                for (i in 0 until 64) {
+                    val b = qs[i].toInt() and 0xFF
+                    val v0 = (b and 0x03) - 1         // bits 0-1
+                    val v1 = ((b shr 2) and 0x03) - 1 // bits 2-3
+                    val v2 = ((b shr 4) and 0x03) - 1 // bits 4-5
+                    val v3 = ((b shr 6) and 0x03) - 1 // bits 6-7
+
+                    out[outOff + i * 4 + 0] = v0 * scale
+                    out[outOff + i * 4 + 1] = v1 * scale
+                    out[outOff + i * 4 + 2] = v2 * scale
+                    out[outOff + i * 4 + 3] = v3 * scale
+                }
+                outOff += blockSize
+            }
+            return out
+        }
+
+        /**
+         * Dequantize TQ1_0 (Ternary base-3) format to FP32.
+         *
+         * TQ1_0 layout per block (256 elements, 54 bytes):
+         * - 48 bytes: base-3 packed data (5 values per byte, 240 elements total)
+         * - 4 bytes: 2-bit packed for remaining 16 elements
+         * - 2 bytes: f16 scale
+         *
+         * Base-3 encoding: 5 ternary values packed into one byte (3^5 = 243 < 256).
+         * Values {0, 1, 2} represent {-1, 0, +1}.
+         * Dequantization: output[i] = (ternary[i] - 1) * scale
+         */
+        internal fun dequantTQ1_0(raw: List<Any>, nElems: Int): FloatArray {
+            val bytes = toByteArray(raw, "TQ1_0")
+            val blockSize = 256
+            val bytesPerBlock = 54 // 48 (base-3) + 4 (2-bit) + 2 (f16 scale)
+            val blockCount = bytes.size / bytesPerBlock
+            val out = FloatArray(blockCount * blockSize)
+            var offset = 0
+            var outOff = 0
+
+            repeat(blockCount) {
+                // Read base-3 packed data (48 bytes = 240 elements)
+                val qsBase3 = bytes.copyOfRange(offset, offset + 48)
+                offset += 48
+
+                // Read 2-bit packed data for remaining 16 elements (4 bytes)
+                val qs2bit = bytes.copyOfRange(offset, offset + 4)
+                offset += 4
+
+                // Read f16 scale
+                val scale = halfToFloat(
+                    (bytes[offset + 1].toInt() and 0xFF shl 8) or (bytes[offset].toInt() and 0xFF)
+                )
+                offset += 2
+
+                // Decode base-3 packed values (5 values per byte)
+                // Each byte b encodes: v0 + v1*3 + v2*9 + v3*27 + v4*81
+                var outIdx = 0
+                for (i in 0 until 48) {
+                    var b = qsBase3[i].toInt() and 0xFF
+                    repeat(5) {
+                        val v = (b % 3) - 1  // Extract value and convert to {-1, 0, +1}
+                        out[outOff + outIdx] = v * scale
+                        outIdx++
+                        b /= 3
+                    }
+                }
+
+                // Decode remaining 16 elements from 2-bit packing (4 bytes)
+                for (i in 0 until 4) {
+                    val b = qs2bit[i].toInt() and 0xFF
+                    val v0 = (b and 0x03) - 1
+                    val v1 = ((b shr 2) and 0x03) - 1
+                    val v2 = ((b shr 4) and 0x03) - 1
+                    val v3 = ((b shr 6) and 0x03) - 1
+
+                    out[outOff + 240 + i * 4 + 0] = v0 * scale
+                    out[outOff + 240 + i * 4 + 1] = v1 * scale
+                    out[outOff + 240 + i * 4 + 2] = v2 * scale
+                    out[outOff + 240 + i * 4 + 3] = v3 * scale
+                }
+
+                outOff += blockSize
+            }
+            return out
+        }
     }
 
     /**
@@ -922,7 +1039,9 @@ public class LlamaWeightLoader(
             GGMLQuantizationType.Q6_K,
             GGMLQuantizationType.Q8_K,
             GGMLQuantizationType.IQ4_NL,
-            GGMLQuantizationType.IQ4_XS -> {
+            GGMLQuantizationType.IQ4_XS,
+            GGMLQuantizationType.TQ1_0,
+            GGMLQuantizationType.TQ2_0 -> {
                 when (quantPolicy) {
                     QuantPolicy.RAW_BYTES -> {
                         require(dtype == Int8::class) {
@@ -954,6 +1073,8 @@ public class LlamaWeightLoader(
                             GGMLQuantizationType.Q8_K -> dequantQ8K(raw, rt.nElements)
                             GGMLQuantizationType.IQ4_NL -> dequantIQ4NL(raw, rt.nElements)
                             GGMLQuantizationType.IQ4_XS -> dequantIQ4XS(raw, rt.nElements)
+                            GGMLQuantizationType.TQ1_0 -> dequantTQ1_0(raw, rt.nElements)
+                            GGMLQuantizationType.TQ2_0 -> dequantTQ2_0(raw, rt.nElements)
                             else -> error("Dequantization for ${rt.tensorType} not implemented yet")
                         }
                         @Suppress("UNCHECKED_CAST")
 
@@ -190,4 +190,124 @@ class LlamaQuantDequantTest {
         }
         assertContentEquals(expected.toList(), out.toList())
     }
+
+    @Test
+    fun `dequant TQ2_0 block with scale 1 and all zeros yields minus ones`() {
+        // TQ2_0: 66 bytes = 64 data + 2 f16 scale
+        // All data bytes = 0x00 -> each 2-bit value is 0 -> (0-1) = -1
+        // Scale = 1.0 (0x3C00)
+        val raw = ByteArray(66) { 0x00 }
+        raw[64] = 0x00  // scale low byte
+        raw[65] = 0x3C  // scale high byte (f16 1.0)
+        val out = LlamaWeightLoader.dequantTQ2_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { -1f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ2_0 block with all ones yields zeros`() {
+        // All data bytes = 0x55 -> each 2-bit value is 1 (01 01 01 01) -> (1-1) = 0
+        // Scale = 1.0
+        val raw = ByteArray(66) { 0x55 }
+        raw[64] = 0x00; raw[65] = 0x3C
+        val out = LlamaWeightLoader.dequantTQ2_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { 0f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ2_0 block with all twos yields plus ones`() {
+        // All data bytes = 0xAA -> each 2-bit value is 2 (10 10 10 10) -> (2-1) = +1
+        // Scale = 1.0
+        val raw = ByteArray(66) { 0xAA.toByte() }
+        raw[64] = 0x00; raw[65] = 0x3C
+        val out = LlamaWeightLoader.dequantTQ2_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { 1f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ2_0 block applies scale correctly`() {
+        // All twos (+1) with scale = 2.0 (0x4000)
+        val raw = ByteArray(66) { 0xAA.toByte() }
+        raw[64] = 0x00; raw[65] = 0x40  // f16 2.0
+        val out = LlamaWeightLoader.dequantTQ2_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { 2f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ2_0 block with mixed values`() {
+        // First byte = 0xE4 = 11 10 01 00 in binary
+        // Values: v0=0 (-1), v1=1 (0), v2=2 (+1), v3=3 -> but 3 is invalid, should be clamped to +2
+        // Actually TQ2_0 only uses values 0,1,2. If we see 3, (3-1)=2
+        val raw = ByteArray(66) { 0x55 }  // default to zeros
+        raw[0] = 0xE4.toByte()  // 11_10_01_00: v0=-1, v1=0, v2=+1, v3=+2 (if 3 is allowed)
+        raw[64] = 0x00; raw[65] = 0x3C  // scale = 1.0
+        val out = LlamaWeightLoader.dequantTQ2_0(raw.toList(), 256)
+        // First 4 elements: (0-1)=-1, (1-1)=0, (2-1)=+1, (3-1)=+2
+        kotlin.test.assertEquals(-1f, out[0], 0.001f)
+        kotlin.test.assertEquals(0f, out[1], 0.001f)
+        kotlin.test.assertEquals(1f, out[2], 0.001f)
+        kotlin.test.assertEquals(2f, out[3], 0.001f)  // 3 encodes as +2 when scaled
+    }
+
+    @Test
+    fun `dequant TQ1_0 block with all zeros yields minus ones`() {
+        // TQ1_0: 54 bytes = 48 base-3 + 4 2-bit + 2 f16 scale
+        // All base-3 bytes = 0 means each decoded value is 0 -> (0-1) = -1
+        // All 2-bit bytes = 0 means remaining 16 values are also -1
+        val raw = ByteArray(54) { 0x00 }
+        raw[52] = 0x00; raw[53] = 0x3C  // scale = 1.0
+        val out = LlamaWeightLoader.dequantTQ1_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { -1f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ1_0 block with base3 ones yields zeros`() {
+        // Base-3 encoding: each byte encodes 5 values as v0 + v1*3 + v2*9 + v3*27 + v4*81
+        // For all ones: 1 + 3 + 9 + 27 + 81 = 121 (0x79)
+        // 2-bit packed: 0x55 = 01 01 01 01 = all ones
+        val raw = ByteArray(54) { 0x00 }
+        repeat(48) { raw[it] = 0x79 }  // base-3 all ones
+        repeat(4) { raw[48 + it] = 0x55 }  // 2-bit all ones
+        raw[52] = 0x00; raw[53] = 0x3C  // scale = 1.0
+        val out = LlamaWeightLoader.dequantTQ1_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { 0f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ1_0 block with base3 twos yields plus ones`() {
+        // For all twos: 2 + 6 + 18 + 54 + 162 = 242 (0xF2)
+        // 2-bit packed: 0xAA = 10 10 10 10 = all twos
+        val raw = ByteArray(54) { 0x00 }
+        repeat(48) { raw[it] = 0xF2.toByte() }  // base-3 all twos
+        repeat(4) { raw[48 + it] = 0xAA.toByte() }  // 2-bit all twos
+        raw[52] = 0x00; raw[53] = 0x3C  // scale = 1.0
+        val out = LlamaWeightLoader.dequantTQ1_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { 1f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ1_0 block applies scale correctly`() {
+        // All twos with scale = 2.0
+        val raw = ByteArray(54) { 0x00 }
+        repeat(48) { raw[it] = 0xF2.toByte() }  // base-3 all twos
+        repeat(4) { raw[48 + it] = 0xAA.toByte() }  // 2-bit all twos
+        raw[52] = 0x00; raw[53] = 0x40  // scale = 2.0
+        val out = LlamaWeightLoader.dequantTQ1_0(raw.toList(), 256)
+        assertContentEquals(FloatArray(256) { 2f }.toList(), out.toList())
+    }
+
+    @Test
+    fun `dequant TQ1_0 base3 decoding for mixed values`() {
+        // Test decoding first 5 values from one base-3 byte
+        // Values: 0, 1, 2, 0, 1 -> 0 + 1*3 + 2*9 + 0*27 + 1*81 = 3 + 18 + 81 = 102 (0x66)
+        val raw = ByteArray(54) { 0x79 }  // default all ones
+        raw[0] = 0x66  // first 5 values: -1, 0, +1, -1, 0
+        repeat(4) { raw[48 + it] = 0x55 }  // 2-bit all ones
+        raw[52] = 0x00; raw[53] = 0x3C  // scale = 1.0
+        val out = LlamaWeightLoader.dequantTQ1_0(raw.toList(), 256)
+        kotlin.test.assertEquals(-1f, out[0], 0.001f)
+        kotlin.test.assertEquals(0f, out[1], 0.001f)
+        kotlin.test.assertEquals(1f, out[2], 0.001f)
+        kotlin.test.assertEquals(-1f, out[3], 0.001f)
+        kotlin.test.assertEquals(0f, out[4], 0.001f)
+    }
 }