Merge pull request #81 from SKaiNET-developers/feature/ISSUE-80-mha-reshape-permute

michalharakal · web-flow · commit f6e02a510e96 · 2026-04-28T11:30:24.000+02:00
fix(mha): materialise multi-head reshape — fixes forwardBatched divergence
diff --git a/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/MultiHeadAttention.kt b/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/MultiHeadAttention.kt
@@ -197,11 +197,29 @@ public class MultiHeadAttention<T : DType, V>(
             v = ops.add(v, params[vWIdx + 1].value)
         }
 
-        // Reshape to multi-head: [seqLen, dim] → [nHeads, seqLen, headDim]
+        // Reshape to multi-head and put heads first.
+        //
+        // Q/K/V projections produce [seqLen, qDim] where qDim = nHeads*headDim.
+        // Row-major flat layout is [s, h, d] → s*qDim + h*headDim + d. SDPA
+        // expects [batch, nHeads, seqLen, headDim] — i.e. heads-first layout
+        // [h, s, d] → h*seqLen*headDim + s*headDim + d.
+        //
+        // For seqLen == 1 the two layouts coincide flat-byte-for-flat-byte,
+        // so a naked `reshape(t, Shape(nHeads, seqLen, headDim))` was visibly
+        // correct in the autoregressive (one-token-per-forward) path. For
+        // seqLen > 1 it silently reorders the data: `t.get(h, s, d)` reads
+        // `data[h*seqLen*headDim + s*headDim + d]` from a buffer laid out as
+        // `s*nHeads*headDim + h*headDim + d`, mixing the rows of head h with
+        // values from other heads. That is the root cause of the batched-
+        // prefill divergence (commit `bd3eb9c`).
+        //
+        // The correct transformation needs an explicit dim-0/dim-1 swap.
+        // SKaiNET's `ops.transpose` only swaps the LAST two dims, so we
+        // can't reuse it here; we materialise the permute via a copy.
         val seqLen = if (input.rank >= 2) input.shape[input.rank - 2] else 1
-        q = ops.reshape(q, Shape(nHeads, seqLen, headDim))
-        k = ops.reshape(k, Shape(nKVHeads, seqLen, headDim))
-        var vReshaped = ops.reshape(v, Shape(nKVHeads, seqLen, headDim))
+        q = swapSeqHeadDims(ops.reshape(q, Shape(seqLen, nHeads, headDim)), ctx)
+        k = swapSeqHeadDims(ops.reshape(k, Shape(seqLen, nKVHeads, headDim)), ctx)
+        var vReshaped = swapSeqHeadDims(ops.reshape(v, Shape(seqLen, nKVHeads, headDim)), ctx)
 
         // Optional QK-Norm
         if (qNorm != null && kNorm != null) {
@@ -276,9 +294,18 @@ public class MultiHeadAttention<T : DType, V>(
         )
         if (mhaDump) mhaDumpStat("[blk.0.mha post-SDPA        ]", attnOut)
 
-        // Remove batch dim and merge heads: [1, nHeads, seqLen, headDim] → [seqLen, qDim]
+        // Remove batch dim and merge heads.
+        //
+        // SDPA returns [1, nHeads, seqLen, headDim]. We need [seqLen, qDim].
+        // Symmetric inverse of the heads-first permute on the input side:
+        // first squeeze the batch dim → [nHeads, seqLen, headDim], then
+        // swap dims 0/1 → [seqLen, nHeads, headDim], finally reshape to
+        // [seqLen, qDim] (contiguous: row s = concatenation of heads 0..nHeads-1
+        // for that token). For seqLen == 1 the swap is identity, so this
+        // matches the prior naked reshape for the autoregressive case.
         val squeezed = ops.squeeze(attnOut, 0)
-        val merged = ops.reshape(squeezed, Shape(seqLen, qDim))
+        val swappedBack = swapSeqHeadDims(squeezed, ctx)
+        val merged = ops.reshape(swappedBack, Shape(seqLen, qDim))
 
         // Output projection: merged @ wO^T (+ bias if enabled)
         var output = linearProject(ops, merged, wO)
@@ -333,6 +360,38 @@ public class MultiHeadAttention<T : DType, V>(
         )
     }
 
+    /**
+     * Swap dims 0 and 1 of a rank-3 tensor: `[D0, D1, D2]` → `[D1, D0, D2]`.
+     *
+     * SKaiNET's [TensorOps.transpose] only swaps the last two dims, so this
+     * transformation is materialised via a copy. When `D0 == 1` or `D1 == 1`
+     * the flat layouts coincide, so the copy loop is skipped and the tensor
+     * is only reshaped. Requires `t.rank == 3` (`IllegalArgumentException`).
+     */
+    private fun swapSeqHeadDims(t: Tensor<T, V>, ctx: ExecutionContext): Tensor<T, V> {
+        require(t.rank == 3) { "swapSeqHeadDims: expected rank-3 tensor, got rank ${t.rank}" }
+        val d0 = t.shape[0]
+        val d1 = t.shape[1]
+        val d2 = t.shape[2]
+        if (d0 == 1 || d1 == 1) {
+            // Layouts coincide; just reinterpret the shape.
+            return ctx.ops.reshape(t, Shape(d1, d0, d2))
+        }
+        val src = t.data.copyToFloatArray() // NOTE(review): always goes through FloatArray, whatever T is — confirm
+        val out = FloatArray(d1 * d0 * d2)
+        for (i in 0 until d0) {
+            for (j in 0 until d1) {
+                val srcOff = (i * d1 + j) * d2 // start of row (i, j); indexing treats src as row-major [d0, d1, d2]
+                val dstOff = (j * d0 + i) * d2 // start of row (j, i) in the row-major [d1, d0, d2] output
+                src.copyInto(out, dstOff, srcOff, srcOff + d2) // move the d2 contiguous innermost elements
+            }
+        }
+        @Suppress("UNCHECKED_CAST")
+        val data = sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(Shape(d1, d0, d2), out)
+            as sk.ainet.lang.tensor.data.TensorData<T, V>
+        return ctx.fromData(data, t.dtype)
+    }
+
     private fun repeatKVHeads(t: Tensor<T, V>, repeats: Int, ops: sk.ainet.lang.tensor.ops.TensorOps): Tensor<T, V> {
         if (repeats == 1) return t
         // Repeat each KV head individually so head mapping matches GQA:
diff --git a/llm-runtime/kllama/src/jvmTest/kotlin/sk/ainet/apps/kllama/BatchedPrefillEquivalenceTest.kt b/llm-runtime/kllama/src/jvmTest/kotlin/sk/ainet/apps/kllama/BatchedPrefillEquivalenceTest.kt
@@ -0,0 +1,210 @@
+package sk.ainet.apps.kllama
+
+import java.nio.file.Path
+import kotlin.io.path.exists
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+import kotlinx.coroutines.runBlocking
+import sk.ainet.apps.llm.OptimizedLLMMode
+import sk.ainet.apps.llm.OptimizedLLMRuntime
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.io.JvmRandomAccessSource
+import sk.ainet.io.model.QuantPolicy
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.DenseFloatArrayTensorData
+import sk.ainet.lang.tensor.data.MemorySegmentTensorData
+import sk.ainet.lang.types.FP32
+import sk.ainet.models.llama.LlamaNetworkLoader
+
+/**
+ * Verifies that `forwardBatched(IntArray)` produces the same last-position
+ * logits as the equivalent autoregressive `forward(t)` per token. This is
+ * the regression test the `bd3eb9c` revert was missing — without it,
+ * batched prefill quietly diverged from the autoregressive baseline.
+ *
+ * Uses TinyLlama 1.1B Q8_0 (DEQUANTIZE_TO_FP32 policy → pure FP32 forward
+ * pass). This sidesteps the Gemma 4 forward-pass correctness issues
+ * tracked separately on develop, so this test is a clean check on the
+ * batched-vs-autoregressive plumbing only.
+ *
+ * If the model file is absent, each test prints "[skip]" and returns early without asserting anything.
+ */
+class BatchedPrefillEquivalenceTest {
+
+    companion object {
+        private val MODEL_PATH = Path.of( // resolved under the `user.home` system property
+            System.getProperty("user.home"),
+            ".lmstudio/models/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+            "tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
+        )
+    }
+
+    @Test
+    fun `forwardBatched matches autoregressive at N=1`() {
+        runEquivalence(intArrayOf(450)) // first prompt token only — should be trivial
+    }
+
+    @Test
+    fun `forwardBatched matches autoregressive at N=2`() {
+        runEquivalence(intArrayOf(450, 7483))
+    }
+
+    @Test
+    fun `forwardBatched matches autoregressive prefill at last position`() {
+        if (!MODEL_PATH.exists()) {
+            println("[skip] Model not at $MODEL_PATH")
+            return // no assertions run when the model file is missing
+        }
+        runBlocking {
+            // Fixed prompt — encode once, replay through both paths.
+            // The tokenizer is only used to encode the prompt; both paths are fed the resulting token ids.
+            val ctx = DirectCpuExecutionContext()
+            val tokenizer = JvmRandomAccessSource.open(MODEL_PATH.toString()).use { source ->
+                GGUFTokenizer.fromRandomAccessSource(source)
+            }
+            val prompt = "The capital of France is"
+            val promptTokens = tokenizer.encode(prompt)
+            require(promptTokens.size >= 2) { "Need ≥2 tokens to exercise the loop" }
+            println("[diag] prompt tokens: ${promptTokens.toList()}")
+
+            // --- Autoregressive baseline ---
+            val autoLogits = run {
+                val model = LlamaNetworkLoader.fromGguf(
+                    randomAccessProvider = { JvmRandomAccessSource.open(MODEL_PATH.toString()) },
+                    quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32
+                ).load<FP32, Float>(ctx)
+                val runtime = OptimizedLLMRuntime(
+                    model = model,
+                    ctx = ctx,
+                    mode = OptimizedLLMMode.DIRECT,
+                    dtype = FP32::class
+                )
+                var l: Tensor<FP32, Float> = runtime.forward(promptTokens[0])
+                for (i in 1 until promptTokens.size) {
+                    l = runtime.forward(promptTokens[i]) // only the output of the final call is kept
+                }
+                extractLogits(l)
+            }
+
+            // --- Batched (separately loaded model and runtime, same ctx) ---
+            val batchLogits = run {
+                val model = LlamaNetworkLoader.fromGguf(
+                    randomAccessProvider = { JvmRandomAccessSource.open(MODEL_PATH.toString()) },
+                    quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32
+                ).load<FP32, Float>(ctx)
+                val runtime = OptimizedLLMRuntime(
+                    model = model,
+                    ctx = ctx,
+                    mode = OptimizedLLMMode.DIRECT,
+                    dtype = FP32::class
+                )
+                extractLogits(runtime.forwardBatched(promptTokens))
+            }
+
+            // --- Compare ---
+            assertEquals(autoLogits.size, batchLogits.size,
+                "logit vector length mismatch")
+            val tol = 1e-3f // absolute tolerance on the largest per-element difference
+            var maxAbsDiff = 0f
+            var maxRelDiff = 0f // relative to the autoregressive value; only printed, never asserted
+            var argmaxAuto = 0
+            var argmaxBatch = 0
+            for (i in autoLogits.indices) {
+                val a = autoLogits[i]
+                val b = batchLogits[i]
+                val d = kotlin.math.abs(a - b)
+                if (d > maxAbsDiff) maxAbsDiff = d
+                val r = if (kotlin.math.abs(a) > 1e-6f) d / kotlin.math.abs(a) else 0f
+                if (r > maxRelDiff) maxRelDiff = r
+                if (a > autoLogits[argmaxAuto]) argmaxAuto = i
+                if (b > batchLogits[argmaxBatch]) argmaxBatch = i
+            }
+            println("[diag] max_abs_diff=$maxAbsDiff max_rel_diff=$maxRelDiff " +
+                "argmax_auto=$argmaxAuto argmax_batch=$argmaxBatch " +
+                "auto[argmax]=${autoLogits[argmaxAuto]} " +
+                "batch[argmax]=${batchLogits[argmaxBatch]}")
+            assertEquals(argmaxAuto, argmaxBatch,
+                "argmax token differs: auto=$argmaxAuto batch=$argmaxBatch")
+            assertTrue(maxAbsDiff < tol,
+                "max_abs_diff=$maxAbsDiff exceeds tolerance $tol; " +
+                    "batched prefill diverges from autoregressive")
+        }
+    }
+
+    private fun runEquivalence(promptTokens: IntArray) { // shared body for the fixed-token-id tests above
+        if (!MODEL_PATH.exists()) {
+            println("[skip] Model not at $MODEL_PATH")
+            return // no assertions run when the model file is missing
+        }
+        runBlocking {
+            val ctx = DirectCpuExecutionContext()
+            println("[diag] N=${promptTokens.size} prompt tokens: ${promptTokens.toList()}")
+
+            val autoLogits = run { // token-by-token forward(); the last call's output is the baseline
+                val model = LlamaNetworkLoader.fromGguf(
+                    randomAccessProvider = { JvmRandomAccessSource.open(MODEL_PATH.toString()) },
+                    quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32
+                ).load<FP32, Float>(ctx)
+                val runtime = OptimizedLLMRuntime(
+                    model = model, ctx = ctx,
+                    mode = OptimizedLLMMode.DIRECT, dtype = FP32::class
+                )
+                var l: Tensor<FP32, Float> = runtime.forward(promptTokens[0])
+                for (i in 1 until promptTokens.size) l = runtime.forward(promptTokens[i])
+                extractLogits(l)
+            }
+            val batchLogits = run { // one forwardBatched() call on a separately loaded model and runtime
+                val model = LlamaNetworkLoader.fromGguf(
+                    randomAccessProvider = { JvmRandomAccessSource.open(MODEL_PATH.toString()) },
+                    quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32
+                ).load<FP32, Float>(ctx)
+                val runtime = OptimizedLLMRuntime(
+                    model = model, ctx = ctx,
+                    mode = OptimizedLLMMode.DIRECT, dtype = FP32::class
+                )
+                extractLogits(runtime.forwardBatched(promptTokens))
+            }
+            assertEquals(autoLogits.size, batchLogits.size)
+            var maxAbsDiff = 0f
+            var argmaxAuto = 0
+            var argmaxBatch = 0
+            for (i in autoLogits.indices) {
+                val d = kotlin.math.abs(autoLogits[i] - batchLogits[i])
+                if (d > maxAbsDiff) maxAbsDiff = d
+                if (autoLogits[i] > autoLogits[argmaxAuto]) argmaxAuto = i
+                if (batchLogits[i] > batchLogits[argmaxBatch]) argmaxBatch = i
+            }
+            println("[diag] N=${promptTokens.size} max_abs_diff=$maxAbsDiff " +
+                "argmax_auto=$argmaxAuto argmax_batch=$argmaxBatch " +
+                "auto_top=${autoLogits[argmaxAuto]} batch_top=${batchLogits[argmaxBatch]}")
+            assertEquals(argmaxAuto, argmaxBatch,
+                "argmax differs at N=${promptTokens.size}")
+            assertTrue(maxAbsDiff < 1e-3f,
+                "max_abs_diff=$maxAbsDiff exceeds 1e-3 at N=${promptTokens.size}")
+        }
+    }
+
+    private fun extractLogits(t: Tensor<FP32, Float>): FloatArray { // returns a fresh array of shape.volume floats
+        val data = t.data
+        return when (data) {
+            is DenseFloatArrayTensorData<*> -> {
+                val n = t.shape.volume
+                if (data.buffer.size == n) data.buffer.copyOf()
+                else data.buffer.copyOf(n) // buffer length differs from volume: copyOf(n) truncates or zero-pads
+            }
+            is MemorySegmentTensorData<*> -> {
+                val n = t.shape.volume
+                val out = FloatArray(n)
+                java.lang.foreign.MemorySegment.copy(
+                    data.segment,
+                    java.lang.foreign.ValueLayout.JAVA_FLOAT,
+                    data.segmentByteOffset,
+                    out, 0, n // destination array, destination start index, element count
+                )
+                out
+            }
+            else -> error("Unsupported tensor data type: ${data::class}")
+        }
+    }
+}