Merge pull request #195 from SKaiNET-developers/fix/llama-gguf-orientation

michalharakal · web-flow · commit e4a07991f6c3 · 2026-06-25T15:31:21.000+02:00
feat(llama): NATIVE_OPTIMIZED packed weight path (mirror Gemma)
diff --git a/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaNetworkLoader.kt b/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaNetworkLoader.kt
@@ -156,7 +156,18 @@ public class LlamaNetworkLoader @PublishedApi internal constructor(
             }
         }
 
-        return applyWeightsToNetwork(weights)
+        // NATIVE_OPTIMIZED keeps quantized tensors as raw 1-D bytes; convert them to the packed /
+        // FP32 forms the DSL matmul + gather paths consume (mirrors the Gemma packed path).
+        val ggufPolicy = (weightsProvider as? WeightsProvider.GgufSource)?.quantPolicy
+            ?: (weightsProvider as? WeightsProvider.GgufRandomAccess)?.quantPolicy
+        val finalWeights: DecoderGgufWeights<T, V> = if (ggufPolicy == QuantPolicy.NATIVE_OPTIMIZED) {
+            @Suppress("UNCHECKED_CAST")
+            convertLlamaWeightsPacked(weights, ctx) as DecoderGgufWeights<T, V>
+        } else {
+            weights
+        }
+
+        return applyWeightsToNetwork(finalWeights)
     }
 
     /**
diff --git a/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaPackedWeights.kt b/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaPackedWeights.kt
@@ -0,0 +1,109 @@
+package sk.ainet.models.llama
+
+import sk.ainet.context.ExecutionContext
+import sk.ainet.io.gguf.GGMLQuantizationType
+import sk.ainet.io.gguf.dequant.DequantOps
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.DenseFloatArrayTensorData
+import sk.ainet.lang.tensor.data.IntArrayTensorData
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.DType
+import sk.ainet.lang.types.FP32
+
+/**
+ * Converts the weights of a `NATIVE_OPTIMIZED` Llama load into the tensor forms the DSL matmul
+ * and gather paths read (the Llama counterpart of `convertGemmaWeightsPacked`). Lives in
+ * commonMain and uses no `java.lang.foreign`, so it is usable from Kotlin/Native and the JVM.
+ *
+ * Per tensor, the first matching rule wins:
+ * - no entry in `quantTypes`, or no shape from [logicalShapeFor] → kept unchanged;
+ * - **token_embd** → FP32 dequant in `[vocab, embed]` order (gathered, so no transpose);
+ * - **Q4_K / Q5_K / Q6_K / Q8_0** → heap-packed `Q*BlockTensorData` via [packLlamaKQuant];
+ * - any other quantized type → FP32 dequant via `dequantTransposed`.
+ *
+ * @param weights loaded weights; returned as-is when `quantTypes` is empty.
+ * @param ctx execution context used to wrap the converted data in tensors.
+ * @return a new [DecoderGgufWeights] with the same metadata and the original `quantTypes` map.
+ */
+public fun convertLlamaWeightsPacked(
+    weights: DecoderGgufWeights<*, *>,
+    ctx: ExecutionContext,
+): DecoderGgufWeights<*, *> {
+    @Suppress("UNCHECKED_CAST")
+    val source = weights as DecoderGgufWeights<DType, Any>
+    val quantByName = source.quantTypes
+    if (quantByName.isEmpty()) return weights
+
+    val converted = linkedMapOf<String, Tensor<DType, Any>>()
+    for ((name, tensor) in source.tensors) {
+        val quantType = quantByName[name]
+        val shape = if (quantType == null) null else logicalShapeFor(name, source.metadata)
+        if (quantType == null || shape == null) {
+            // Unquantized (norms, f32) or no known 2-D layout: pass through untouched.
+            converted[name] = tensor
+            continue
+        }
+        val raw = extractRawBytes(tensor.data)
+        converted[name] = if (name == LlamaTensorNames.TOKEN_EMBEDDINGS) {
+            // Embeddings are read by row lookup, so they are expanded to FP32 and never packed.
+            dequantNoTranspose(raw, quantType, shape, ctx)
+        } else {
+            // Every other matrix (output/lm_head included) is packed when a layout exists.
+            val packed = packLlamaKQuant<FP32>(raw, quantType, shape)
+            if (packed == null) {
+                dequantTransposed(raw, quantType, shape, ctx)
+            } else {
+                @Suppress("UNCHECKED_CAST")
+                ctx.fromData(packed as TensorData<FP32, Float>, FP32::class) as Tensor<DType, Any>
+            }
+        }
+    }
+    @Suppress("UNCHECKED_CAST")
+    return DecoderGgufWeights(source.metadata, converted, source.quantTypes) as DecoderGgufWeights<*, *>
+}
+
+/** Expands quantized `bytes` to FP32 and wraps them with `shape`, keeping the dequantized element order. */
+@Suppress("UNCHECKED_CAST")
+private fun dequantNoTranspose(
+    bytes: ByteArray,
+    qt: GGMLQuantizationType,
+    shape: Shape,
+    ctx: ExecutionContext,
+): Tensor<DType, Any> {
+    val dense = DenseFloatArrayTensorData<FP32>(shape, DequantOps.dequantFromBytes(bytes, qt, shape.volume))
+    return ctx.fromData(dense, FP32::class) as Tensor<DType, Any>
+}
+
+/** Dequantizes to FP32 and reorders the values into `[out, in]` row-major order for `shape`. */
+@Suppress("UNCHECKED_CAST")
+private fun dequantTransposed(
+    bytes: ByteArray,
+    qt: GGMLQuantizationType,
+    shape: Shape,
+    ctx: ExecutionContext,
+): Tensor<DType, Any> {
+    val dequantized = DequantOps.dequantFromBytes(bytes, qt, shape.volume)
+    val (outDim, inDim) = shape[0] to shape[1]
+    // NOTE(review): presumes the dequantized values are column-major, unlike dequantNoTranspose — confirm.
+    val rowMajor = DequantOps.transposeColumnMajorToRowMajor(dequantized, inDim, outDim)
+    return ctx.fromFloatArray<FP32, Float>(shape, FP32::class, rowMajor) as Tensor<DType, Any>
+}
+
+/** Reads a tensor's storage back as raw bytes: an [IntArrayTensorData] buffer, or `Byte`/`Int` elements. */
+internal fun extractRawBytes(data: TensorData<*, *>): ByteArray {
+    if (data is IntArrayTensorData<*>) {
+        val ints = data.buffer
+        return ByteArray(ints.size) { i -> ints[i].toByte() }
+    }
+    // Generic path: read element by element; only Byte and Int values are accepted.
+    @Suppress("UNCHECKED_CAST")
+    val elements = data as TensorData<*, Any?>
+    return ByteArray(data.shape.volume) { i ->
+        when (val value = elements[i]) {
+            is Byte -> value
+            is Int -> value.toByte()
+            else -> error("convertLlamaWeightsPacked: cannot read bytes from ${data::class.simpleName}")
+        }
+    }
+}
diff --git a/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaQuantLayout.kt b/llm-inference/llama/src/commonMain/kotlin/sk/ainet/models/llama/LlamaQuantLayout.kt
@@ -0,0 +1,106 @@
+package sk.ainet.models.llama
+
+import sk.ainet.io.gguf.GGMLQuantizationType
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q8_0BlockTensorData
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.DType
+
+/**
+ * Platform-neutral (commonMain) layout helpers for Llama quantized weights — the Llama analogue
+ * of `GemmaQuantLayout`. A `NATIVE_OPTIMIZED` load stores quantized tensors as 1-D byte arrays,
+ * so the converter needs the original `[out, in]` shape (from metadata) to relayout blocks.
+ */
+
+/**
+ * Derives the logical 2-D `[out, in]` shape of a Llama weight from its GGUF tensor name and the
+ * model metadata. The head dimension is taken as `embeddingLength / headCount` (0 when
+ * `headCount` is not positive), so shapes follow metadata rather than the stored tensor.
+ *
+ * @return `[vocab, embed]` for the token-embedding and output weights, the projection shape for
+ *   the recognised `blk.*` attention / FFN weights, and null for every other name.
+ */
+internal fun logicalShapeFor(name: String, metadata: LlamaModelMetadata): Shape? {
+    val embedDim = metadata.embeddingLength
+    val vocabDim = metadata.vocabSize
+    val headDim = if (metadata.headCount > 0) embedDim / metadata.headCount else 0
+    val queryDim = metadata.headCount * headDim
+    val keyValueDim = metadata.kvHeadCount * headDim
+    val ffnDim = metadata.feedForwardLength
+    if (name == LlamaTensorNames.TOKEN_EMBEDDINGS || name == LlamaTensorNames.OUTPUT_WEIGHT) {
+        return Shape(vocabDim, embedDim)
+    }
+    if (!name.startsWith("blk.")) return null
+    return when {
+        name.endsWith(".attn_q.weight") -> Shape(queryDim, embedDim)
+        name.endsWith(".attn_k.weight") || name.endsWith(".attn_v.weight") -> Shape(keyValueDim, embedDim)
+        name.endsWith(".attn_output.weight") -> Shape(embedDim, queryDim)
+        name.endsWith(".ffn_gate.weight") || name.endsWith(".ffn_up.weight") -> Shape(ffnDim, embedDim)
+        name.endsWith(".ffn_down.weight") -> Shape(embedDim, ffnDim)
+        else -> null
+    }
+}
+
+/**
+ * Moves each `bytesPerBlock`-byte block of a 2-D `[outDim, inDim]` weight from index `r * blocksPerRow + b`
+ * to `b * outDim + r` (a block-level transpose; bytes inside a block are untouched). Requires `inDim % blockSize == 0`.
+ */
+internal fun relayoutKSeriesRowMajorToBlockMajor(
+    bytes: ByteArray,
+    shape: Shape,
+    bytesPerBlock: Int,
+    blockSize: Int = 256,
+): ByteArray {
+    require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" }
+    val outDim = shape[0]
+    val inDim = shape[1]
+    require(inDim % blockSize == 0) { "K-series weight inDim ($inDim) must be a multiple of $blockSize" }
+    val blocksPerRow = inDim / blockSize
+    val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong()
+    require(bytes.size.toLong() >= expected) {
+        "K-series byte buffer ${bytes.size} < expected $expected for [$outDim, $inDim] @ ${bytesPerBlock}B/block"
+    }
+    val relaid = ByteArray(bytes.size)
+    var src = 0 // source blocks are visited sequentially, so the read offset just advances
+    repeat(outDim) { row ->
+        repeat(blocksPerRow) { block ->
+            val dst = (block * outDim + row) * bytesPerBlock
+            bytes.copyInto(relaid, destinationOffset = dst, startIndex = src, endIndex = src + bytesPerBlock)
+            src += bytesPerBlock
+        }
+    }
+    return relaid
+}
+
+private fun quantBlockLayout(qt: GGMLQuantizationType): Pair<Int, Int>? = when (qt) {
+    GGMLQuantizationType.Q4_K -> Pair(256, 144) // (elements per block, bytes per block)
+    GGMLQuantizationType.Q5_K -> Pair(256, 176)
+    GGMLQuantizationType.Q6_K -> Pair(256, 210)
+    GGMLQuantizationType.Q8_0 -> Pair(32, 34)
+    else -> null // no packed layout known for this type
+}
+
+/**
+ * Wraps raw GGUF `bytes` of logical `[out, in]` shape in the block tensor data matching `qt`
+ * (Q4_K / Q5_K / Q6_K / Q8_0) after the block relayout; returns null for any other quant type.
+ */
+internal fun <T : DType> packLlamaKQuant(
+    bytes: ByteArray,
+    qt: GGMLQuantizationType,
+    shape: Shape,
+): TensorData<T, *>? {
+    val layout = quantBlockLayout(qt) ?: return null
+    val blockMajor = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, layout.second, layout.first)
+    val packed: Any? = when (qt) {
+        GGMLQuantizationType.Q4_K -> Q4_KBlockTensorData(shape, blockMajor)
+        GGMLQuantizationType.Q5_K -> Q5_KBlockTensorData(shape, blockMajor)
+        GGMLQuantizationType.Q6_K -> Q6_KBlockTensorData(shape, blockMajor)
+        GGMLQuantizationType.Q8_0 -> Q8_0BlockTensorData(shape, blockMajor)
+        else -> null
+    }
+    @Suppress("UNCHECKED_CAST")
+    return packed as TensorData<T, *>?
+}

Original file line number	Diff line number	Diff line change
`@@ -156,7 +156,18 @@ public class LlamaNetworkLoader @PublishedApi internal constructor(`
`156`	`156`	`}`
`157`	`157`	`}`
`158`	`158`
`159`		`- return applyWeightsToNetwork(weights)`
	`159`	`+ // NATIVE_OPTIMIZED keeps quantized tensors as raw 1-D bytes; convert them to the packed /`
	`160`	`+ // FP32 forms the DSL matmul + gather paths consume (mirrors the Gemma packed path).`
	`161`	`+ val ggufPolicy = (weightsProvider as? WeightsProvider.GgufSource)?.quantPolicy`
	`162`	`+ ?: (weightsProvider as? WeightsProvider.GgufRandomAccess)?.quantPolicy`
	`163`	`+ val finalWeights: DecoderGgufWeights<T, V> = if (ggufPolicy == QuantPolicy.NATIVE_OPTIMIZED) {`
	`164`	`+ @Suppress("UNCHECKED_CAST")`
	`165`	`+ convertLlamaWeightsPacked(weights, ctx) as DecoderGgufWeights<T, V>`
	`166`	`+ } else {`
	`167`	`+ weights`
	`168`	`+ }`
	`169`	`+`
	`170`	`+ return applyWeightsToNetwork(finalWeights)`
`160`	`171`	`}`
`161`	`172`
`162`	`173`	`/**`