Merge pull request #179 from SKaiNET-developers/fix/gemma-board-embed-nocopy

michalharakal · web-flow · commit 689a283923ae · 2026-06-15T13:44:56.000+02:00
fix(gemma): keep tied Q8_0 lm_head packed in eager NATIVE_OPTIMIZED path (#178)
diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt
@@ -5,6 +5,7 @@ import sk.ainet.io.gguf.GGMLQuantizationType
 import sk.ainet.io.gguf.dequant.DequantOps
 import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.DenseFloatArrayTensorData
 import sk.ainet.lang.tensor.data.IntArrayTensorData
 import sk.ainet.lang.tensor.data.TensorData
 import sk.ainet.lang.types.DType
@@ -48,8 +49,12 @@ public fun convertGemmaWeightsPacked(
                     tensor // unknown 2-D layout — leave as-is
                 } else {
                     val bytes = extractRawBytes(tensor.data)
-                    val isEmbed = name == Gemma4TensorNames.TOKEN_EMBEDDINGS ||
-                        name == Gemma4TensorNames.OUTPUT_WEIGHT
+                    // Only the token-embedding table is gathered (row lookup) and so
+                    // must be FP32 here. `output`/lm_head is a real matmul weight —
+                    // it stays packed (FunctionGemma's tied output is Q8_0 → NEON
+                    // Q8_0 kernel, transposed lazily by ops.transpose) instead of a
+                    // second ~0.67 GB FP32 copy that would OOM the 1.9 GB board.
+                    val isEmbed = name == Gemma4TensorNames.TOKEN_EMBEDDINGS
                     val packed = if (!isEmbed) packGemmaKQuant<FP32>(bytes, qt, shape) else null
                     when {
                         packed != null -> {
@@ -76,7 +81,11 @@ private fun dequantNoTranspose(
     ctx: ExecutionContext,
 ): Tensor<DType, Any> {
     val floats = DequantOps.dequantFromBytes(bytes, qt, shape.volume)
-    return ctx.fromFloatArray<FP32, Float>(shape, FP32::class, floats) as Tensor<DType, Any>
+    // Wrap the dequantized array directly (no copy) instead of ctx.fromFloatArray,
+    // which routes through BufferHandleFactory.owned and allocates a second
+    // full-size buffer — for the 262k×640 FP32 token_embd (~0.67 GB) that
+    // transient doubling is by itself enough to OOM the 1.9 GB board.
+    return ctx.fromData(DenseFloatArrayTensorData<FP32>(shape, floats), FP32::class) as Tensor<DType, Any>
 }
 
 /**
diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt
@@ -5,6 +5,7 @@ import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q8_0BlockTensorData
 import sk.ainet.lang.tensor.data.TensorData
 import sk.ainet.lang.types.DType
 
@@ -66,8 +67,8 @@ internal fun relayoutKSeriesRowMajorToBlockMajor(
     bytes: ByteArray,
     shape: Shape,
     bytesPerBlock: Int,
+    blockSize: Int = 256,
 ): ByteArray {
-    val blockSize = 256
     require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" }
     val outDim = shape[0]
     val inDim = shape[1]
@@ -88,19 +89,31 @@ internal fun relayoutKSeriesRowMajorToBlockMajor(
     return out
 }
 
-/** Bytes per ggml block for the K-quant types this packer handles. */
-private fun kQuantBytesPerBlock(qt: GGMLQuantizationType): Int? = when (qt) {
-    GGMLQuantizationType.Q4_K -> 144
-    GGMLQuantizationType.Q5_K -> 176
-    GGMLQuantizationType.Q6_K -> 210
+/**
+ * Block geometry `(blockElems, bytesPerBlock)` for the quant types this packer
+ * handles. The K-series are 256-element super-blocks; Q8_0 is a 32-element block
+ * (2-byte f16 scale + 32 int8 = 34 bytes). All four have a first-class CPU matmul
+ * kernel and a lazy transpose in `ops.transpose`, so all can stay packed, not FP32.
+ */
+private fun quantBlockLayout(qt: GGMLQuantizationType): Pair<Int, Int>? = when (qt) {
+    GGMLQuantizationType.Q4_K -> 256 to 144
+    GGMLQuantizationType.Q5_K -> 256 to 176
+    GGMLQuantizationType.Q6_K -> 256 to 210
+    GGMLQuantizationType.Q8_0 -> 32 to 34
     else -> null
 }
 
 /**
- * Pack raw GGUF K-quant `bytes` of logical `[out, in]` shape into the
- * heap-packed block tensor data the matmul kernels read directly (Q4_K / Q5_K /
- * Q6_K). Performs the row-major → block-major relayout. Returns `null` for
- * non-K-quant types (caller dequantizes those to FP32).
+ * Pack raw GGUF `bytes` of logical `[out, in]` shape into the heap-packed block
+ * tensor data the matmul kernels read directly (Q4_K / Q5_K / Q6_K / Q8_0).
+ * Performs the row-major → block-major relayout. Returns `null` for types
+ * without a packed kernel (caller dequantizes those to FP32).
+ *
+ * Q8_0 matters for gemma's tied `output`/lm_head: FunctionGemma's token_embd is
+ * Q8_0, so keeping the lm_head packed (vs ~0.67 GB FP32) is what lets the eager
+ * decode fit the 1.9 GB board, and it runs on the NEON Q8_0 kernel. (This
+ * requires the Q8_0 case in the engine's `ops.transpose`, so that `linearProject`
+ * can transpose the packed weight; see transformers #178.)
  *
  * commonMain → works on JVM and Kotlin/Native alike (no MemSeg / Arena).
  */
@@ -109,13 +122,14 @@ internal fun <T : DType> packGemmaKQuant(
     qt: GGMLQuantizationType,
     shape: Shape,
 ): TensorData<T, *>? {
-    val bpb = kQuantBytesPerBlock(qt) ?: return null
-    val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb)
+    val (blockElems, bpb) = quantBlockLayout(qt) ?: return null
+    val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb, blockElems)
     @Suppress("UNCHECKED_CAST")
     return when (qt) {
         GGMLQuantizationType.Q4_K -> Q4_KBlockTensorData(shape, relaid) as TensorData<T, *>
         GGMLQuantizationType.Q5_K -> Q5_KBlockTensorData(shape, relaid) as TensorData<T, *>
         GGMLQuantizationType.Q6_K -> Q6_KBlockTensorData(shape, relaid) as TensorData<T, *>
+        GGMLQuantizationType.Q8_0 -> Q8_0BlockTensorData(shape, relaid) as TensorData<T, *>
         else -> null
     }
 }