Merge pull request #196 from SKaiNET-developers/perf/fused-decode-attention

michalharakal · web-flow · commit 3179e9e2acd2 · 2026-06-25T15:32:02.000+02:00
perf(mha)+fix(rope): fused decode-attention & traceable interleaved RoPE
diff --git a/transformer-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/MultiHeadAttention.kt b/transformer-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/MultiHeadAttention.kt
@@ -333,6 +333,24 @@ public class MultiHeadAttention<T : DType, V>(
             mhaDumpStat("[blk.0.mha cached-V (full)  ]", fullV)
         }
 
+        // Fused decode-attention fast path — the hot autoregressive case.
+        // When seqQ == 1 (one token per forward), self-attention, and no
+        // sliding-window mask, compute scores → softmax → (GQA) weighted-V
+        // directly from the cached K/V buffers in a single buffer-direct pass,
+        // emitting the merged [1, qDim] output. This skips repeatKVHeads' concat
+        // (built every token/layer), the unsqueeze → SDPA → squeeze → permute
+        // chain, and every intermediate tensor those allocate — which the
+        // jstack profile (docs/upstream/A2-PROFILE.md) showed dominate decode.
+        // Numerically identical to the general path below for seqLen 1 (same
+        // max-stable softmax, same GQA head mapping head h → kv head h/nRep).
+        if (qSeqLen == 1 && !isCrossAttention && slidingWindow == null) {
+            val merged = fusedDecodeAttention(q, fullK, fullV, scale, ctx)
+            var output = linearProject(ops, merged, wO)
+            if (bias) output = ops.add(output, params[oWIdx + 1].value)
+            if (mhaDump) mhaDumpStat("[blk.0.mha post-fused-decode ]", output)
+            return output
+        }
+
         // Expand KV heads for GQA if needed
         val expandedK = if (nKVHeads < nHeads) repeatKVHeads(fullK, nHeads / nKVHeads, ops) else fullK
         val expandedV = if (nKVHeads < nHeads) repeatKVHeads(fullV, nHeads / nKVHeads, ops) else fullV
@@ -391,6 +409,69 @@ public class MultiHeadAttention<T : DType, V>(
         return output
     }
 
+    /**
+     * Fused single-token (decode) attention computed directly on flat float buffers.
+     *
+     * For each query head `h`: scaled dot-product scores against every cached key of
+     * KV head `h / (nHeads / nKVHeads)` (matching [repeatKVHeads]), a max-subtracted
+     * softmax over those scores, then the softmax-weighted sum of that head's value
+     * rows. V is walked one row per key so reads stay contiguous, while every output
+     * element still accumulates its keys in ascending order.
+     *
+     * @param q query, expected `[nHeads, 1, headDim]` (heads-first, post-RoPE).
+     * @param fullK cached keys, expected `[nKVHeads, seqKV, headDim]`.
+     * @param fullV cached values, expected to share [fullK]'s layout.
+     * @param scale factor applied to every raw dot product before the softmax.
+     * @param ctx execution context used to wrap the result buffer as a tensor.
+     * @return `[1, qDim]` tensor: the per-head outputs concatenated in head order.
+     */
+    private fun fusedDecodeAttention(
+        q: Tensor<T, V>,
+        fullK: Tensor<T, V>,
+        fullV: Tensor<T, V>,
+        scale: Float,
+        ctx: ExecutionContext,
+    ): Tensor<T, V> {
+        val qBuf = q.data.copyToFloatArray()        // [nHeads * headDim]
+        val kBuf = fullK.data.copyToFloatArray()    // [nKVHeads * seqKV * headDim]
+        val vBuf = fullV.data.copyToFloatArray()    // [nKVHeads * seqKV * headDim]
+        val seqKV = fullK.shape[1]
+        val nRep = nHeads / nKVHeads
+        val out = FloatArray(nHeads * headDim)      // == qDim, row-major [h, d]; zero-initialized
+        val scores = FloatArray(seqKV)              // scratch buffer reused by every head
+        for (h in 0 until nHeads) {
+            val headOff = h * headDim               // offset of head h in both qBuf and out
+            val kvHeadBase = (h / nRep) * seqKV * headDim   // GQA: start of the KV head this query head reads
+            // scores[ki] = (q_h · k_ki) * scale, tracking the max for a stable softmax
+            var maxV = Float.NEGATIVE_INFINITY
+            for (ki in 0 until seqKV) {
+                val kOff = kvHeadBase + ki * headDim
+                var dot = 0f
+                for (d in 0 until headDim) dot += qBuf[headOff + d] * kBuf[kOff + d]
+                val s = dot * scale
+                scores[ki] = s
+                if (s > maxV) maxV = s
+            }
+            // out_h += exp(score_ki - max) * v_ki: unnormalized weights, one contiguous V row per key
+            var sum = 0f
+            for (ki in 0 until seqKV) {
+                val w = kotlin.math.exp(scores[ki] - maxV)
+                sum += w
+                val vOff = kvHeadBase + ki * headDim
+                for (d in 0 until headDim) out[headOff + d] += w * vBuf[vOff + d]
+            }
+            // Normalize once per head; a sum that is not positive (e.g. seqKV == 0) maps to inv = 0.
+            val inv = if (sum > 0f) 1f / sum else 0f
+            for (d in 0 until headDim) out[headOff + d] *= inv
+        }
+        @Suppress("UNCHECKED_CAST")
+        return ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(Shape(1, qDim), out)
+                as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            q.dtype,
+        )
+    }
+
     /**
      * Build an additive mask tensor of shape `[1, 1, seqQ, seqKV]` where allowed
      * (query, key) cells are 0 and masked cells are a large negative value so
diff --git a/transformer-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/RoPE.kt b/transformer-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/RoPE.kt
@@ -4,6 +4,7 @@ import sk.ainet.context.ExecutionContext
 import sk.ainet.lang.nn.Module
 import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.ops.KspTensorOps
 import sk.ainet.lang.types.DType
 import kotlin.math.cos
 import kotlin.math.pow
@@ -259,6 +260,16 @@ public class RoPE<T : DType, V>(
      * `headDim - rotaryDim` floats of every head are left untouched.
      */
     private fun applyRoPEInterleaved(input: Tensor<T, V>, position: Int, ctx: ExecutionContext): Tensor<T, V> {
+        // Graph tracing: the raw-array path below reads input.data and rebuilds via
+        // fromFloatArray, which records the rotated Q/K as a DISCONNECTED CONSTANT —
+        // severing the link to the projection weights. Post-GQA-broadcast that lowers
+        // to a slice-into-empty const cascade that crashes iree-compile. Under the
+        // tracing wrapper (KspTensorOps), take the traceable op-based path so the
+        // rotation is recorded as tensor ops. Full-rotary only (TinyLlama/Llama/
+        // Mistral); partial rotary keeps the raw path (no GGUF model needs it traced).
+        if (rotaryDim == headDim && input.ops is KspTensorOps) {
+            return applyRoPEInterleavedOps(input, position, ctx)
+        }
         val data = input.data.copyToFloatArray()
         val lastDim = input.shape[input.rank - 1]
         require(lastDim == headDim) { "RoPE input last dim ($lastDim) != headDim ($headDim)" }
@@ -287,4 +298,69 @@ public class RoPE<T : DType, V>(
 
         return ctx.fromFloatArray(input.shape, input.dtype, data)
     }
+
+    /**
+     * Op-based variant of [applyRoPEInterleaved] for graph tracing: the rotation is
+     * expressed purely through tensor ops so it can be recorded to a compute graph,
+     * instead of being baked in as a disconnected constant by the raw-array path.
+     *
+     * Each interleaved pair `(x[2i], x[2i+1])` is exposed by viewing the head dim
+     * as `[halfRotary, 2]` (row-major: `[i,0]=x[2i]`, `[i,1]=x[2i+1]`); the even and
+     * odd planes are rotated and then interleaved back. Only valid for full rotary
+     * (`rotaryDim == headDim`) — the caller is responsible for that gate.
+     *
+     * @param input tensor shaped `[..., seqLen, headDim]`.
+     * @param position table row used for sequence index 0; index `s` uses row `position + s`.
+     * @param ctx context supplying the ops and the tensor factory.
+     * @return rotated tensor with the same shape as [input].
+     */
+    private fun applyRoPEInterleavedOps(input: Tensor<T, V>, position: Int, ctx: ExecutionContext): Tensor<T, V> {
+        val tensorOps = ctx.ops
+        val rank = input.rank
+        val lastDim = input.shape[rank - 1]
+        require(lastDim == headDim) { "RoPE input last dim ($lastDim) != headDim ($headDim)" }
+        val seqLen = input.shape[rank - 2]
+
+        // Rows position..position+seqLen-1 of the precomputed tables are one contiguous
+        // run of seqLen * halfRotary values starting at position * halfRotary.
+        val tableStart = position * halfRotary
+        val cosData = FloatArray(seqLen * halfRotary) { cosTable[tableStart + it] }
+        val sinData = FloatArray(seqLen * halfRotary) { sinTable[tableStart + it] }
+        val tableShape = Shape(seqLen, halfRotary)
+        // Wraps one table slice as a [seqLen, halfRotary] tensor with the input's dtype.
+        @Suppress("UNCHECKED_CAST")
+        fun tableTensor(values: FloatArray): Tensor<T, V> = ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(tableShape, values) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            input.dtype,
+        )
+        val cosTensor = tableTensor(cosData)
+        val sinTensor = tableTensor(sinData)
+
+        // View [..., seqLen, headDim] as [..., seqLen, halfRotary, 2]: the trailing
+        // size-2 axis now holds each interleaved pair.
+        val leading = IntArray(rank - 1) { input.shape[it] }
+        val paired = tensorOps.reshape(input, Shape(*leading, halfRotary, 2))
+
+        // Take index 0 / 1 of that trailing axis (axis `rank` of the paired view) and
+        // drop it, giving the even and odd planes as [..., seqLen, halfRotary].
+        val planeShape = Shape(*leading, halfRotary)
+        val evenSlice = tensorOps.narrow(paired, rank, 0, 1)
+        val even = tensorOps.reshape(evenSlice, planeShape)
+        val oddSlice = tensorOps.narrow(paired, rank, 1, 1)
+        val odd = tensorOps.reshape(oddSlice, planeShape)
+
+        // Rotation: (even, odd) -> (even*cos - odd*sin, even*sin + odd*cos). The
+        // [seqLen, halfRotary] tables broadcast over the leading (head/batch) dims.
+        val evenCos = tensorOps.multiply(even, cosTensor)
+        val oddSin = tensorOps.multiply(odd, sinTensor)
+        val rotEven = tensorOps.subtract(evenCos, oddSin)
+        val evenSin = tensorOps.multiply(even, sinTensor)
+        val oddCos = tensorOps.multiply(odd, cosTensor)
+        val rotOdd = tensorOps.add(evenSin, oddCos)
+
+        // Interleave again: stack the two planes on a new trailing axis
+        // ([..., halfRotary, 2]) and flatten back to the input shape.
+        val stacked = listOf(tensorOps.unsqueeze(rotEven, rank), tensorOps.unsqueeze(rotOdd, rank))
+        return tensorOps.reshape(tensorOps.concat(stacked, dim = rank), input.shape)
+    }
 }