Skip to content

Commit c17bada

Browse files
michalharakalclaude
andcommitted
feat(q4_0): FP32→Q4_0 quantizer (loader-agnostic production)
Adds Q4_0Quantizer in commonMain — the produce side Q4_0 was missing (it was decode-only, since GGUF arrives pre-quantized). Now any source of dense FP32 weights — a SafeTensors/JSON loader, an in-memory tensor, an offline tool — can emit canonical ggml Q4_0 blocks without GGUF. Algorithm matches ggml quantize_row_q4_0: per 32-element block, scale d = max/-8 (max = signed max-magnitude element), code = clamp(round(x/d + 8), 0, 15), packed in the canonical split layout; scale stored as round-to-nearest FP16. Tests: - Q4_0QuantizerTest — round-trips through Q4_0TensorData.toFloatArray within 4-bit error, recovers the max element, zero stays zero. - Q4_0QuantizeRoundTripMatmulTest — quantized weights run through the matmul dispatch and track the dense FP32 result, proving the quantizer output is consumable by the (scalar/Panama/native) kernels. Note: automatic on-load quantization via a loader policy is deliberately NOT wired here. DTypePolicy targets logical DType, not TensorEncoding, so requesting "Q4_0" needs a new encoding-policy type — an RFC-level API decision (parallel to #615) the maintainer should own. This PR ships the reusable primitive every such path would call. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent db60ab9 commit c17bada

4 files changed

Lines changed: 277 additions & 0 deletions

File tree

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package sk.ainet.exec.tensor.ops
2+
3+
import kotlin.math.abs
4+
import kotlin.random.Random
5+
import kotlin.test.Test
6+
import kotlin.test.assertTrue
7+
import sk.ainet.context.DirectCpuExecutionContext
8+
import sk.ainet.lang.tensor.Shape
9+
import sk.ainet.lang.tensor.Tensor
10+
import sk.ainet.lang.tensor.data.Q4_0Quantizer
11+
import sk.ainet.lang.tensor.data.TensorData
12+
import sk.ainet.lang.types.FP32
13+
14+
/**
 * Checks that weights produced by [Q4_0Quantizer] can be passed straight to
 * `ctx.ops.matmul` and give results close to a plain FP32 matmul of the same
 * dense weights.
 *
 * Each case builds a random dense weight matrix, computes the FP32 reference
 * product, quantizes the weights to Q4_0, runs the matmul through the
 * execution context, and compares the two results element by element.
 */
class Q4_0QuantizeRoundTripMatmulTest {

    private val ctx = DirectCpuExecutionContext()

    /**
     * Runs one dense-vs-quantized comparison.
     *
     * @param inputDim number of input features; the packing loop below walks
     *   `inputDim / 32` whole blocks.
     * @param outputDim number of output features.
     * @param seed seed for the random weights and input vector.
     */
    @Suppress("UNCHECKED_CAST")
    private fun assertQuantizedTracksDense(inputDim: Int, outputDim: Int, seed: Int) {
        val random = Random(seed)
        // Draw order matters for reproducibility: all weights first (row by row), then the input.
        // dense[o][j] is the weight from input j to output o.
        val dense = Array(outputDim) { FloatArray(inputDim) { random.nextFloat() - 0.5f } }
        val activations = FloatArray(inputDim) { random.nextFloat() - 0.5f }

        // FP32 reference: expected[o] = Σ_j activations[j] * dense[o][j].
        val expected = FloatArray(outputDim) { o ->
            var sum = 0f
            for (j in 0 until inputDim) sum += activations[j] * dense[o][j]
            sum
        }

        // Lay the weights out block-major before quantizing: the 32-value slot
        // (blockIdx, o) holds inputs [blockIdx*32, blockIdx*32 + 31] of output o.
        // This is the packed order this test feeds to the Q4_0 matmul path.
        val blockCount = inputDim / 32
        val packed = FloatArray(inputDim * outputDim)
        for (blockIdx in 0 until blockCount) {
            val sourceStart = blockIdx * 32
            for (o in 0 until outputDim) {
                dense[o].copyInto(
                    destination = packed,
                    destinationOffset = (blockIdx * outputDim + o) * 32,
                    startIndex = sourceStart,
                    endIndex = sourceStart + 32,
                )
            }
        }

        val quantized = Q4_0Quantizer.quantize(packed, Shape(inputDim, outputDim))
        val weight: Tensor<FP32, Float> = ctx.fromData(quantized as TensorData<FP32, Float>, FP32::class)
        val input = ctx.fromFloatArray<FP32, Float>(Shape(1, inputDim), FP32::class, activations)

        val out = ctx.ops.matmul(input, weight).data.copyToFloatArray()

        // Error budget: a fixed 0.1 plus 0.1 per 32-element block (at least one block).
        // The budget grows linearly with the block count, which is intentionally generous
        // for a sum of small, random-signed per-weight quantization errors.
        val tol = 0.1f + 0.1f * blockCount.coerceAtLeast(1)
        for (o in 0 until outputDim) {
            val diff = abs(expected[o] - out[o])
            assertTrue(
                diff <= tol,
                "quantized matmul drifted at $o: dense=${expected[o]} q4_0=${out[o]} diff=$diff tol=$tol",
            )
        }
    }

    /** Two blocks feeding a single output. */
    @Test
    fun single_output_tracks_dense() {
        assertQuantizedTracksDense(inputDim = 64, outputDim = 1, seed = 1)
    }

    /** Square 128×128 case, sized like an attention projection. */
    @Test
    fun attention_proj_shape_tracks_dense() {
        assertQuantizedTracksDense(inputDim = 128, outputDim = 128, seed = 2)
    }
}

skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3056,6 +3056,12 @@ public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion {
30563056
public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
30573057
}
30583058

3059+
public final class sk/ainet/lang/tensor/data/Q4_0Quantizer {
3060+
public static final field INSTANCE Lsk/ainet/lang/tensor/data/Q4_0Quantizer;
3061+
public final fun quantize ([FLsk/ainet/lang/tensor/Shape;)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
3062+
public final fun quantizeToBytes ([F)[B
3063+
}
3064+
30593065
public abstract interface class sk/ainet/lang/tensor/data/Q4_0TensorData : sk/ainet/lang/tensor/data/TensorData {
30603066
public static final field BLOCK_SIZE I
30613067
public static final field BYTES_PER_BLOCK I
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
package sk.ainet.lang.tensor.data
2+
3+
import sk.ainet.lang.tensor.Shape
4+
import kotlin.math.abs
5+
6+
/**
 * FP32 → Q4_0 quantizer — the encode-side counterpart to the decoding
 * offered by [Q4_0TensorData].
 *
 * Lets any source of dense FP32 values (a loader that only carries dense
 * weights, an in-memory tensor, an offline packing tool) produce ggml-style
 * Q4_0 blocks without going through a pre-quantized file.
 *
 * Algorithm, per 32-element block (modelled on ggml `quantize_row_q4_0`):
 *  1. Find the element of greatest magnitude `max`, keeping its sign.
 *  2. `d = max / -8`, so that code `0` (meaning `-8`) reproduces `max`;
 *     `d` is stored as the block's FP16 scale (round-to-nearest-even),
 *     little-endian, in the first two bytes of the block.
 *  3. Each element becomes `code = clamp(floor(x / d + 8.5), 0, 15)`. The 16
 *     bytes that follow the scale use the split layout: byte `j` carries
 *     element `j` in its low nibble and element `j + 16` in its high nibble.
 */
public object Q4_0Quantizer {

    /** Number of FP32 values encoded by one Q4_0 block. */
    private const val BLOCK_SIZE = 32

    /** Encoded size of one block: 2 bytes of FP16 scale + 16 bytes of packed nibbles. */
    private const val BYTES_PER_BLOCK = 18

    /**
     * Quantizes [values] into packed Q4_0 bytes.
     *
     * @param values dense FP32 values; the length must be a multiple of 32.
     * @return `18 * (values.size / 32)` bytes, one 18-byte block per 32 inputs.
     * @throws IllegalArgumentException if `values.size` is not a multiple of 32.
     */
    public fun quantizeToBytes(values: FloatArray): ByteArray {
        require(values.size % BLOCK_SIZE == 0) {
            "Q4_0 quantization requires a length that is a multiple of $BLOCK_SIZE; got ${values.size}"
        }
        val blocks = values.size / BLOCK_SIZE
        val out = ByteArray(blocks * BYTES_PER_BLOCK)

        for (b in 0 until blocks) {
            val base = b * BLOCK_SIZE

            // 1. Max-magnitude value, sign preserved. NaN never wins the `>` comparison,
            //    so a NaN element cannot become the block maximum.
            var amax = 0f
            var max = 0f
            for (i in 0 until BLOCK_SIZE) {
                val v = values[base + i]
                val a = abs(v)
                if (a > amax) {
                    amax = a
                    max = v
                }
            }

            // 2. Scale and its reciprocal. An all-zero block gets d = 0 and id = 0,
            //    which sends every element to the mid code 8.
            val d = max / -8f
            val id = if (d != 0f) 1f / d else 0f

            val outBase = b * BYTES_PER_BLOCK
            // FP16 scale, little-endian.
            val half = floatToHalf(d)
            out[outBase] = (half and 0xFF).toByte()
            out[outBase + 1] = ((half ushr 8) and 0xFF).toByte()

            // 3. Codes, split layout: byte j packs element j (low) and j + 16 (high).
            for (j in 0 until 16) {
                val lo = quantCode(values[base + j], id)
                val hi = quantCode(values[base + 16 + j], id)
                out[outBase + 2 + j] = ((hi shl 4) or lo).toByte()
            }
        }
        return out
    }

    /**
     * Quantizes [values] into a [Q4_0BlockTensorData] with logical [shape].
     *
     * @param values dense FP32 values; the length must be a multiple of 32.
     * @param shape logical shape; `shape.volume` must equal `values.size`.
     * @throws IllegalArgumentException if the volume does not match, or if the
     *   length is not a multiple of 32.
     */
    public fun quantize(values: FloatArray, shape: Shape): Q4_0BlockTensorData {
        require(shape.volume == values.size) {
            "shape volume ${shape.volume} must equal values length ${values.size}"
        }
        return Q4_0BlockTensorData(shape, quantizeToBytes(values))
    }

    /**
     * Maps one value to its 4-bit code given the inverse scale [id].
     *
     * Mirrors ggml's `(int)(x * id + 8.5f)`: adding 8.5 and truncating rounds
     * `x * id + 8` to the nearest integer for non-negative results; the result
     * is then clamped to `[0, 15]`.
     */
    private fun quantCode(x: Float, id: Float): Int {
        val q = (x * id + 8.5f).toInt()
        return if (q < 0) 0 else if (q > 15) 15 else q
    }

    /**
     * Converts an FP32 value to IEEE 754 binary16 bits using
     * round-to-nearest, ties-to-even.
     *
     * Handles all ranges: values too large for FP16 become ±infinity, values
     * in the FP16 subnormal range are rounded (not truncated), values below
     * half of the smallest subnormal become ±0, and NaN stays NaN.
     *
     * @return the 16 half-precision bits in the low bits of the returned Int.
     */
    private fun floatToHalf(value: Float): Int {
        val bits = value.toRawBits()
        val sign = (bits ushr 16) and 0x8000
        val rawExp = (bits ushr 23) and 0xFF
        val mant = bits and 0x7FFFFF

        if (rawExp == 0xFF) {
            // FP32 infinity keeps its class; NaN maps to a quiet FP16 NaN rather than infinity.
            return if (mant == 0) sign or 0x7C00 else sign or 0x7E00
        }

        // Re-bias the exponent from FP32 (127) to FP16 (15).
        val exp = rawExp - 127 + 15
        return when {
            exp >= 0x1F -> sign or 0x7C00 // overflow → ±inf
            exp <= 0 -> {
                if (exp < -10) {
                    // Strictly below half of the smallest subnormal: rounds to ±0.
                    sign
                } else {
                    // Subnormal result: shift the 24-bit significand (implicit 1 restored)
                    // into place and round on the bits that fall off.
                    val full = mant or 0x800000
                    val shift = 14 - exp // 14..24
                    val truncated = full ushr shift
                    val remainder = full and ((1 shl shift) - 1)
                    val halfway = 1 shl (shift - 1)
                    val roundUp = remainder > halfway || (remainder == halfway && (truncated and 1) == 1)
                    // A carry out of the mantissa correctly yields the smallest normal.
                    sign or (truncated + if (roundUp) 1 else 0)
                }
            }
            else -> {
                // Normal result: keep the top 10 mantissa bits, round on the 13 dropped bits.
                val truncated = (exp shl 10) or (mant ushr 13)
                val remainder = mant and 0x1FFF
                val roundUp = remainder > 0x1000 || (remainder == 0x1000 && (truncated and 1) == 1)
                // A carry out of the mantissa bumps the exponent (up to ±inf), which is correct.
                sign or (truncated + if (roundUp) 1 else 0)
            }
        }
    }
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package sk.ainet.lang.tensor.data
2+
3+
import sk.ainet.lang.tensor.Shape
4+
import kotlin.math.abs
5+
import kotlin.random.Random
6+
import kotlin.test.Test
7+
import kotlin.test.assertEquals
8+
import kotlin.test.assertFailsWith
9+
import kotlin.test.assertTrue
10+
11+
/** Unit tests for [Q4_0Quantizer]: output size, argument validation, and round-trip accuracy. */
class Q4_0QuantizerTest {

    /** Two 32-element blocks must encode to exactly two 18-byte blocks. */
    @Test
    fun `quantizeToBytes produces 18 bytes per 32-element block`() {
        val input = FloatArray(64) { index -> 0.1f * index }
        val packed = Q4_0Quantizer.quantizeToBytes(input)
        assertEquals(2 * 18, packed.size)
    }

    /** A length that is not a multiple of 32 is rejected. */
    @Test
    fun `rejects non-block-aligned length`() {
        val misaligned = FloatArray(31)
        assertFailsWith<IllegalArgumentException> {
            Q4_0Quantizer.quantizeToBytes(misaligned)
        }
    }

    /** Every decoded value stays within about one quantization step of its original. */
    @Test
    fun `quantize then dequantize round-trips within 4-bit error`() {
        val rng = Random(7)
        val n = 32 * 8
        val values = FloatArray(n) { (rng.nextFloat() - 0.5f) * 4f }
        val back = Q4_0Quantizer.quantize(values, Shape(n)).toFloatArray()

        // The step of a block is its largest magnitude divided by 8.
        for (blockStart in 0 until n step 32) {
            val blockRange = blockStart until blockStart + 32

            var amax = 0f
            for (idx in blockRange) amax = maxOf(amax, abs(values[idx]))
            val step = amax / 8f

            for (idx in blockRange) {
                val diff = abs(values[idx] - back[idx])
                assertTrue(
                    diff <= step + 1e-4f,
                    "round-trip error at $idx: orig=${values[idx]} back=${back[idx]} diff=$diff step=$step",
                )
            }
        }
    }

    /** The dominant (negative) element defines the scale and must decode near its original value. */
    @Test
    fun `recovers the max-magnitude element closely`() {
        val values = FloatArray(32).also { block ->
            block[5] = -3.7f // dominant negative
            block[9] = 1.2f
        }
        val decoded = Q4_0Quantizer.quantize(values, Shape(32)).toFloatArray()
        assertEquals(-3.7f, decoded[5], 0.05f)
    }

    /** A block of zeros decodes back to zeros. */
    @Test
    fun `all-zero block stays zero`() {
        val decoded = Q4_0Quantizer.quantize(FloatArray(32), Shape(32)).toFloatArray()
        decoded.forEach { value -> assertEquals(0f, value, 1e-6f) }
    }

    /** The logical shape must describe exactly as many elements as were supplied. */
    @Test
    fun `quantize rejects shape volume mismatch`() {
        val values = FloatArray(32)
        assertFailsWith<IllegalArgumentException> {
            Q4_0Quantizer.quantize(values, Shape(64))
        }
    }
}

0 commit comments

Comments
 (0)