@@ -5,6 +5,7 @@ import sk.ainet.lang.tensor.Shape
55import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
66import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
77import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
8+ import sk.ainet.lang.tensor.data.Q8_0BlockTensorData
89import sk.ainet.lang.tensor.data.TensorData
910import sk.ainet.lang.types.DType
1011
@@ -66,8 +67,8 @@ internal fun relayoutKSeriesRowMajorToBlockMajor(
6667 bytes : ByteArray ,
6768 shape : Shape ,
6869 bytesPerBlock : Int ,
70+ blockSize : Int = 256,
6971): ByteArray {
70- val blockSize = 256
7172 require(shape.rank == 2 ) { " K-series weight must be 2D, got rank ${shape.rank} " }
7273 val outDim = shape[0 ]
7374 val inDim = shape[1 ]
@@ -88,19 +89,31 @@ internal fun relayoutKSeriesRowMajorToBlockMajor(
8889 return out
8990}
9091
91- /* * Bytes per ggml block for the K-quant types this packer handles. */
92- private fun kQuantBytesPerBlock (qt : GGMLQuantizationType ): Int? = when (qt) {
93- GGMLQuantizationType .Q4_K -> 144
94- GGMLQuantizationType .Q5_K -> 176
95- GGMLQuantizationType .Q6_K -> 210
92+ /* *
93+ * Block geometry `(blockElems, bytesPerBlock)` for the quant types this packer
94+ * handles. The K-series are 256-element super-blocks; Q8_0 is a 32-element block
95+ * (f16 scale + 32 int8). All four have a first-class CPU matmul kernel + a lazy
96+ * transpose in `ops.transpose`, so all four can stay packed instead of FP32. Returns `null` for any other quantization type.
97+ */
98+ private fun quantBlockLayout (qt : GGMLQuantizationType ): Pair <Int , Int >? = when (qt) { // pair is (elements per block) to (bytes per block)
99+ GGMLQuantizationType .Q4_K -> 256 to 144 // 256-element super-block stored in 144 bytes
100+ GGMLQuantizationType .Q5_K -> 256 to 176 // 256-element super-block stored in 176 bytes
101+ GGMLQuantizationType .Q6_K -> 256 to 210 // 256-element super-block stored in 210 bytes
102+ GGMLQuantizationType .Q8_0 -> 32 to 34 // 32 int8 values + one f16 scale (2 bytes) = 34 bytes
96103 else -> null // no packed layout for this type; per the packer's KDoc the caller dequantizes to FP32
97104}
98105
99106/* *
100- * Pack raw GGUF K-quant `bytes` of logical `[out, in]` shape into the
101- * heap-packed block tensor data the matmul kernels read directly (Q4_K / Q5_K /
102- * Q6_K). Performs the row-major → block-major relayout. Returns `null` for
103- * non-K-quant types (caller dequantizes those to FP32).
107+ * Pack raw GGUF `bytes` of logical `[out, in]` shape into the heap-packed block
108+ * tensor data the matmul kernels read directly (Q4_K / Q5_K / Q6_K / Q8_0).
109+ * Performs the row-major → block-major relayout. Returns `null` for types
110+ * without a packed kernel (caller dequantizes those to FP32).
111+ *
112+ * Q8_0 matters for gemma's tied `output`/lm_head: FunctionGemma's token_embd is
113+ * Q8_0, so keeping the lm_head packed (vs ~0.67 GB FP32) is what lets the eager
114+ * decode fit the 1.9 GB board, and it runs on the NEON Q8_0 kernel. (Requires
115+ * the Q8_0 case in `ops.transpose` — engine — so `linearProject` can transpose
116+ * the packed weight; see transformers #178.)
104117 *
105118 * commonMain → works on JVM and Kotlin/Native alike (no MemSeg / Arena).
106119 */
@@ -109,13 +122,14 @@ internal fun <T : DType> packGemmaKQuant(
109122 qt : GGMLQuantizationType ,
110123 shape : Shape ,
111124): TensorData <T , * >? {
112- val bpb = kQuantBytesPerBlock (qt) ? : return null
113- val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb)
125+ val (blockElems, bpb) = quantBlockLayout (qt) ? : return null
126+ val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb, blockElems )
114127 @Suppress(" UNCHECKED_CAST" )
115128 return when (qt) {
116129 GGMLQuantizationType .Q4_K -> Q4_KBlockTensorData (shape, relaid) as TensorData <T , * >
117130 GGMLQuantizationType .Q5_K -> Q5_KBlockTensorData (shape, relaid) as TensorData <T , * >
118131 GGMLQuantizationType .Q6_K -> Q6_KBlockTensorData (shape, relaid) as TensorData <T , * >
132+ GGMLQuantizationType .Q8_0 -> Q8_0BlockTensorData (shape, relaid) as TensorData <T , * >
119133 else -> null
120134 }
121135}
0 commit comments