feat(q4_0): FP32→Q4_0 quantizer (loader-agnostic production)

michalharakal · claude · michalharakal · commit c17bada52c82 · 2026-05-30T19:57:57.000+02:00
Adds Q4_0Quantizer in commonMain — the produce side Q4_0 was missing (it was decode-only, since GGUF arrives pre-quantized). Now any source of dense FP32 weights — a SafeTensors/JSON loader, an in-memory tensor, an offline tool — can emit canonical ggml Q4_0 blocks without GGUF. Algorithm matches ggml quantize_row_q4_0: per 32-element block, scale d = max/-8 (max = signed max-magnitude element), code = clamp(round( x/d + 8), 0, 15), packed in the canonical split layout; scale stored as round-to-nearest FP16. Tests: - Q4_0QuantizerTest — round-trips through Q4_0TensorData.toFloatArray within 4-bit error, recovers the max element, zero stays zero. - Q4_0QuantizeRoundTripMatmulTest — quantized weights run through the matmul dispatch and track the dense FP32 result, proving the quantizer output is consumable by the (scalar/Panama/native) kernels. Note: automatic on-load quantization via a loader policy is deliberately NOT wired here. DTypePolicy targets logical DType, not TensorEncoding, so requesting "Q4_0" needs a new encoding-policy type — an RFC-level API decision (parallel to #615) the maintainer should own. This PR ships the reusable primitive every such path would call. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0QuantizeRoundTripMatmulTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0QuantizeRoundTripMatmulTest.kt
@@ -0,0 +1,80 @@
+package sk.ainet.exec.tensor.ops
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertTrue
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.Q4_0Quantizer
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.FP32
+
+/**
+ * End-to-end proof that the [Q4_0Quantizer] (FP32 → Q4_0) output is
+ * directly consumable by the matmul dispatch — i.e. *any* loader that
+ * produces dense FP32 weights can quantize them to Q4_0 and run
+ * inference through the same kernel path GGUF Q4_0 weights use.
+ *
+ * Quantizes a dense weight, runs `ctx.ops.matmul(x, qWeight)`, and
+ * checks it tracks the dense FP32 matmul within 4-bit error.
+ */
+class Q4_0QuantizeRoundTripMatmulTest {
+
+    private val ctx = DirectCpuExecutionContext()
+
+    /**
+     * Quantizes a random `outputDim × inputDim` weight to Q4_0, multiplies a random
+     * `1 × inputDim` input through `ctx.ops.matmul`, and asserts every output stays
+     * within tolerance of a plain FP32 dot product. [inputDim] must be a multiple of 32.
+     */
+    @Suppress("UNCHECKED_CAST")
+    private fun assertQuantizedTracksDense(inputDim: Int, outputDim: Int, seed: Int) {
+        // A partial trailing block would silently leave part of `flat` zeroed.
+        require(inputDim > 0 && inputDim % 32 == 0) { "inputDim must be a positive multiple of 32; got $inputDim" }
+        val rng = Random(seed)
+        // Logical weight W[o][j] (output o, input j).
+        val w = Array(outputDim) { FloatArray(inputDim) { rng.nextFloat() - 0.5f } }
+        val inputV = FloatArray(inputDim) { rng.nextFloat() - 0.5f }
+
+        // Reference: plain FP32 matmul.
+        val expected = FloatArray(outputDim)
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (j in 0 until inputDim) acc += inputV[j] * w[o][j]
+            expected[o] = acc
+        }
+
+        // Arrange weights in the kernel's packed block order — block (blockIdx, o)
+        // holds the 32 input positions [blockIdx*32 .. +31] for output o — then quantize
+        // that flat array. A loader producing Q4_0 matmul weights must emit this layout.
+        val blocks = inputDim / 32
+        val flat = FloatArray(inputDim * outputDim)
+        var p = 0
+        for (blockIdx in 0 until blocks) {
+            for (o in 0 until outputDim) {
+                for (k in 0 until 32) flat[p++] = w[o][blockIdx * 32 + k]
+            }
+        }
+        val qData = Q4_0Quantizer.quantize(flat, Shape(inputDim, outputDim))
+        val weight: Tensor<FP32, Float> = ctx.fromData(qData as TensorData<FP32, Float>, FP32::class)
+        val input = ctx.fromFloatArray<FP32, Float>(Shape(1, inputDim), FP32::class, inputV)
+
+        val out = ctx.ops.matmul(input, weight).data.copyToFloatArray()
+
+        // Q4_0 error per weight is ~step/2 (step ≈ |max|/8 per block), so the
+        // dot-product error over random-signed terms grows ~√blocks. The bound
+        // below is linear in blocks: a deliberately loose ceiling, not a tight fit.
+        val tol = 0.1f + 0.1f * blocks
+        for (o in 0 until outputDim) {
+            val diff = abs(expected[o] - out[o])
+            val message = "quantized matmul drifted at $o: dense=${expected[o]} q4_0=${out[o]} diff=$diff tol=$tol"
+            assertTrue(diff <= tol, message)
+        }
+    }
+
+    @Test fun single_output_tracks_dense() = assertQuantizedTracksDense(inputDim = 64, outputDim = 1, seed = 1)
+
+    @Test fun attention_proj_shape_tracks_dense() = assertQuantizedTracksDense(inputDim = 128, outputDim = 128, seed = 2)
+}
diff --git a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
@@ -3056,6 +3056,12 @@ public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion {
 	public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
 }
 
+public final class sk/ainet/lang/tensor/data/Q4_0Quantizer {
+	public static final field INSTANCE Lsk/ainet/lang/tensor/data/Q4_0Quantizer;
+	public final fun quantize ([FLsk/ainet/lang/tensor/Shape;)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
+	public final fun quantizeToBytes ([F)[B
+}
+
 public abstract interface class sk/ainet/lang/tensor/data/Q4_0TensorData : sk/ainet/lang/tensor/data/TensorData {
 	public static final field BLOCK_SIZE I
 	public static final field BYTES_PER_BLOCK I
diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0Quantizer.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0Quantizer.kt
@@ -0,0 +1,119 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import kotlin.math.abs
+
+/**
+ * FP32 → Q4_0 quantizer — the loader-agnostic counterpart to
+ * [Q4_0TensorData]'s decode side.
+ *
+ * Q4_0 was decode-only until now (GGUF files arrive pre-quantized).
+ * This makes Q4_0 *producible* from dense FP32 in pure `commonMain`, so
+ * any source — a SafeTensors / JSON loader that only carries dense
+ * weights, an in-memory tensor, an offline packing tool — can emit
+ * canonical ggml Q4_0 blocks without going through GGUF.
+ *
+ * Algorithm (per 32-element block, matching ggml `quantize_row_q4_0`):
+ *  1. Find the element of greatest magnitude `max` (sign preserved).
+ *  2. `d = max / -8` so the most-negative code (0 → `-8`) recovers it;
+ *     store `d` as the block's FP16 scale.
+ *  3. Each element: `code = clamp(round(x / d + 8), 0, 15)`, packed in
+ *     the canonical split layout (low nibbles → elements 0..15, high →
+ *     16..31).
+ *
+ * Round-trips through [Q4_0TensorData.toFloatArray] within 4-bit
+ * quantization error.
+ */
+public object Q4_0Quantizer {
+
+    private const val BLOCK_SIZE = 32
+    private const val BYTES_PER_BLOCK = 18
+
+    /**
+     * Quantize [values] into packed Q4_0 bytes — `18 * (values.size / 32)`
+     * bytes: per block, a little-endian FP16 scale then 16 nibble-pair bytes.
+     *
+     * @throws IllegalArgumentException if `values.size` is not a multiple of 32.
+     */
+    public fun quantizeToBytes(values: FloatArray): ByteArray {
+        require(values.size % BLOCK_SIZE == 0) {
+            "Q4_0 quantization requires a length that is a multiple of $BLOCK_SIZE; got ${values.size}"
+        }
+        val blocks = values.size / BLOCK_SIZE
+        val out = ByteArray(blocks * BYTES_PER_BLOCK)
+
+        for (b in 0 until blocks) {
+            val base = b * BLOCK_SIZE
+            // 1. Max-magnitude value, sign preserved.
+            var amax = 0f
+            var max = 0f
+            for (i in 0 until BLOCK_SIZE) {
+                val v = values[base + i]
+                val a = abs(v)
+                if (a > amax) {
+                    amax = a
+                    max = v
+                }
+            }
+            // Code 0 decodes to -8 * d = max; an all-zero block leaves id = 0 (every code 8).
+            val d = max / -8f
+            val id = if (d != 0f) 1f / d else 0f
+
+            val outBase = b * BYTES_PER_BLOCK
+            // FP16 scale, little-endian.
+            val half = floatToHalf(d)
+            out[outBase] = (half and 0xFF).toByte()
+            out[outBase + 1] = ((half ushr 8) and 0xFF).toByte()
+
+            // 2. Codes, split layout: byte j packs element j (low) and j+16 (high).
+            for (j in 0 until 16) {
+                val lo = quantCode(values[base + j], id)
+                val hi = quantCode(values[base + 16 + j], id)
+                out[outBase + 2 + j] = ((hi shl 4) or lo).toByte()
+            }
+        }
+        return out
+    }
+
+    /**
+     * Quantize [values] into a [Q4_0BlockTensorData] with logical
+     * [shape] (`shape.volume` must equal `values.size` and be a
+     * multiple of 32).
+     */
+    public fun quantize(values: FloatArray, shape: Shape): Q4_0BlockTensorData {
+        require(shape.volume == values.size) {
+            "shape volume ${shape.volume} must equal values length ${values.size}"
+        }
+        return Q4_0BlockTensorData(shape, quantizeToBytes(values))
+    }
+
+    /** ggml rounding: `(int)(x * id + 8.5f)`, clamped to the 4-bit code range `0..15`. */
+    private fun quantCode(x: Float, id: Float): Int = (x * id + 8.5f).toInt().coerceIn(0, 15)
+
+    /**
+     * FP32 → IEEE 754 binary16 bits (in the low 16 bits of the result), rounding to
+     * nearest with ties to even — including results that land in the FP16 subnormal
+     * range. Overflow and ±infinity map to ±inf; NaN maps to a quiet NaN.
+     */
+    private fun floatToHalf(value: Float): Int {
+        val bits = value.toRawBits()
+        val sign = (bits ushr 16) and 0x8000
+        val rawExp = (bits ushr 23) and 0xFF
+        val mant = bits and 0x7FFFFF
+        if (rawExp == 0xFF && mant != 0) return sign or 0x7E00 // NaN stays NaN
+        val exp = rawExp - 127 + 15 // re-biased for FP16
+        if (exp >= 0x1F) return sign or 0x7C00 // overflow / infinity → ±inf
+        if (exp < -10) return sign // below half the smallest subnormal → ±0
+
+        // Subnormal results make the implicit leading 1 explicit and shift further.
+        val subnormal = exp <= 0
+        val significand = if (subnormal) mant or 0x800000 else mant
+        val shift = if (subnormal) 14 - exp else 13
+        val truncated = (if (subnormal) 0 else exp shl 10) or (significand ushr shift)
+        val remainder = significand and ((1 shl shift) - 1)
+        val halfway = 1 shl (shift - 1)
+        val roundUp = remainder > halfway || (remainder == halfway && (truncated and 1) == 1)
+        // A carry out of the mantissa bumps the exponent, which is the correct result.
+        return sign or (truncated + if (roundUp) 1 else 0)
+    }
+}
diff --git a/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0QuantizerTest.kt b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0QuantizerTest.kt
@@ -0,0 +1,72 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFailsWith
+import kotlin.test.assertTrue
+
+class Q4_0QuantizerTest {
+
+    @Test
+    fun `quantizeToBytes produces 18 bytes per 32-element block`() {
+        // Two full blocks in, two 18-byte blocks out.
+        val input = FloatArray(64) { 0.1f * it }
+        assertEquals(2 * 18, Q4_0Quantizer.quantizeToBytes(input).size)
+    }
+
+    @Test
+    fun `rejects non-block-aligned length`() {
+        val oneShort = FloatArray(31)
+        assertFailsWith<IllegalArgumentException> { Q4_0Quantizer.quantizeToBytes(oneShort) }
+    }
+
+    @Test
+    fun `quantize then dequantize round-trips within 4-bit error`() {
+        val rng = Random(7)
+        val n = 32 * 8
+        val values = FloatArray(n) { (rng.nextFloat() - 0.5f) * 4f }
+        val back = Q4_0Quantizer.quantize(values, Shape(n)).toFloatArray()
+
+        // The block's largest magnitude fixes the step (≈ |max| / 8); tolerate about one step.
+        for (blockStart in 0 until n step 32) {
+            val block = blockStart until blockStart + 32
+            var amax = 0f
+            for (idx in block) amax = maxOf(amax, abs(values[idx]))
+            val step = amax / 8f
+            for (idx in block) {
+                val diff = abs(values[idx] - back[idx])
+                assertTrue(
+                    diff <= step + 1e-4f,
+                    "round-trip error at $idx: orig=${values[idx]} back=${back[idx]} diff=$diff step=$step",
+                )
+            }
+        }
+    }
+
+    @Test
+    fun `recovers the max-magnitude element closely`() {
+        // A dominant negative entry plus a smaller positive one; everything else is zero.
+        val values = FloatArray(32).also {
+            it[5] = -3.7f
+            it[9] = 1.2f
+        }
+        val back = Q4_0Quantizer.quantize(values, Shape(32)).toFloatArray()
+        // With max = -3.7, d = max / -8, so the dominant element decodes near-exactly.
+        assertEquals(-3.7f, back[5], 0.05f)
+    }
+
+    @Test
+    fun `all-zero block stays zero`() {
+        val back = Q4_0Quantizer.quantize(FloatArray(32), Shape(32)).toFloatArray()
+        for (decoded in back) assertEquals(0f, decoded, 1e-6f)
+    }
+
+    @Test
+    fun `quantize rejects shape volume mismatch`() {
+        val values = FloatArray(32)
+        assertFailsWith<IllegalArgumentException> { Q4_0Quantizer.quantize(values, Shape(64)) }
+    }
+}