fix(memory): heap-wrap remaining hot-path tensor allocs — stop direct-memory leak

michalharakal · michalharakal · commit 1e49140078c7 · 2026-04-28T07:57:20.000+02:00
Same root cause as 319c394 (sliceView): ctx.fromFloatArray copies the input FloatArray into a fresh MemorySegment from Arena.ofAuto(). Direct memory doesn't pressure the GC, so per-forward auto-arenas accumulate until -XX:MaxDirectMemorySize is exhausted.

Empirically, the smoke test went from a 45 GB direct-memory OOM mid-prefill to a 271 MB net direct-memory growth across the full 27 min forward, with the resident JVM staying inside the 32 GB cap.

Fixed sites (all on the per-token / per-layer path):

- RoPE.applyRoPESplitHalf: cos/sin tables (sliding layers, partial=1.0)
- RoPE.applyRoPESplitHalfFull: cos/sin tables (full layers, partial=0.25)
- MultiHeadAttention.buildSlidingCausalMask: mask tensor (every block using the sliding path, every forward)
- GemmaModel softcap: scale + inv scalar tensors (every forward)
- PaddedSharedPositionalKVCache.padHeadDim: padded V (Gemma 4 value-head padding when src/target head_dim differ)

Each site now wraps the FloatArray as DenseFloatArrayTensorData and goes through ctx.fromData, which keeps the storage on the heap and lets the GC reclaim it normally.

The tool-call format regression on the smoke test prompt is tracked separately; this commit only fixes the OOM that kept the run from completing.
diff --git a/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/KVCache.kt b/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/KVCache.kt
@@ -473,10 +473,15 @@ public class PaddedSharedPositionalKVCache<T : DType, V>(
                 // remaining [srcHeadDim, targetHeadDim) stays zero
             }
         }
-        return ctx.fromFloatArray<T, V>(
-            sk.ainet.lang.tensor.Shape(nKV, seq, targetHeadDim),
-            t.dtype,
-            out
+        // Heap-backed wrap — fromFloatArray would copy into a fresh
+        // Arena.ofAuto MemorySegment per call; padHeadDim runs every
+        // attention forward when src/target head_dim differ (Gemma 4
+        // value-head padding), so direct memory accumulates without GC
+        // pressure. Same root cause as commit 319c394.
+        val padShape = sk.ainet.lang.tensor.Shape(nKV, seq, targetHeadDim)
+        return ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(padShape, out) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            t.dtype
         )
     }
 }
diff --git a/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/MultiHeadAttention.kt b/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/MultiHeadAttention.kt
@@ -322,7 +322,15 @@ public class MultiHeadAttention<T : DType, V>(
                 data[qi * seqKV + ki] = if (allowed) 0f else neg
             }
         }
-        return ctx.fromFloatArray(Shape(1, 1, seqQ, seqKV), dtype, data)
+        // Heap-backed wrap — fromFloatArray would copy into a fresh
+        // Arena.ofAuto MemorySegment every forward (× layers using the
+        // sliding-mask path), and direct memory doesn't pressure the GC.
+        // Same root cause as the sliceView leak (commit 319c394).
+        val maskShape = Shape(1, 1, seqQ, seqKV)
+        return ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(maskShape, data) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            dtype
+        )
     }
 
     private fun repeatKVHeads(t: Tensor<T, V>, repeats: Int, ops: sk.ainet.lang.tensor.ops.TensorOps): Tensor<T, V> {
diff --git a/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/RoPE.kt b/llm-core/src/commonMain/kotlin/sk/ainet/lang/nn/transformer/RoPE.kt
@@ -191,8 +191,21 @@ public class RoPE<T : DType, V>(
             }
         }
         val cosShape = Shape(seqLen, halfRotary)
-        val cosTensor: Tensor<T, V> = ctx.fromFloatArray(cosShape, input.dtype, cosData)
-        val sinTensor: Tensor<T, V> = ctx.fromFloatArray(cosShape, input.dtype, sinData)
+        // Heap-backed wrap, NOT ctx.fromFloatArray — fromFloatArray would
+        // copy these transient cos/sin tables into fresh MemorySegments
+        // from Arena.ofAuto(). RoPE runs twice per MHA (Q, K) × every
+        // layer × every forward, and direct-memory pressure doesn't trigger
+        // GC, so the auto-arenas accumulate until -XX:MaxDirectMemorySize
+        // is exhausted. Same root-cause class as the sliceView leak
+        // (commit 319c394). Heap arrays follow normal GC.
+        val cosTensor: Tensor<T, V> = ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(cosShape, cosData) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            input.dtype
+        )
+        val sinTensor: Tensor<T, V> = ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(cosShape, sinData) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            input.dtype
+        )
 
         // Standard 2D rotation: (a, b) -> (a*cos - b*sin, a*sin + b*cos)
         val rotA = ops.subtract(ops.multiply(A, cosTensor), ops.multiply(C, sinTensor))
@@ -219,8 +232,16 @@ public class RoPE<T : DType, V>(
         }
 
         val cosShape = Shape(seqLen, halfRotary)
-        val cosTensor: Tensor<T, V> = ctx.fromFloatArray(cosShape, input.dtype, cosData)
-        val sinTensor: Tensor<T, V> = ctx.fromFloatArray(cosShape, input.dtype, sinData)
+        // Heap-backed wrap — see applyRoPESplitHalf for why fromFloatArray
+        // is poison on the hot path (direct-memory leak via Arena.ofAuto).
+        val cosTensor: Tensor<T, V> = ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(cosShape, cosData) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            input.dtype
+        )
+        val sinTensor: Tensor<T, V> = ctx.fromData(
+            sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(cosShape, sinData) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+            input.dtype
+        )
 
         val rotEven = ops.subtract(ops.multiply(even, cosTensor), ops.multiply(odd, sinTensor))
         val rotOdd = ops.add(ops.multiply(odd, cosTensor), ops.multiply(even, sinTensor))
diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaModel.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaModel.kt
@@ -128,11 +128,18 @@ public class GemmaModel<T : DType, V>(
         // onto degenerate attractor tokens during decode.
         if (finalLogitSoftcapping > 0f) {
             val ops = ctx.ops
-            val scale = ctx.fromFloatArray<T, V>(
-                sk.ainet.lang.tensor.Shape(1), dtype, floatArrayOf(1f / finalLogitSoftcapping)
+            // Heap-backed scalar wrap — fromFloatArray copies even
+            // single-float tables into a fresh Arena.ofAuto MemorySegment;
+            // running per forward step accumulates direct memory the GC
+            // can't see. Same root cause as commit 319c394.
+            val scaleShape = sk.ainet.lang.tensor.Shape(1)
+            val scale: Tensor<T, V> = ctx.fromData(
+                sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(scaleShape, floatArrayOf(1f / finalLogitSoftcapping)) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+                dtype
             )
-            val inv = ctx.fromFloatArray<T, V>(
-                sk.ainet.lang.tensor.Shape(1), dtype, floatArrayOf(finalLogitSoftcapping)
+            val inv: Tensor<T, V> = ctx.fromData(
+                sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(scaleShape, floatArrayOf(finalLogitSoftcapping)) as sk.ainet.lang.tensor.data.TensorData<T, V>,
+                dtype
             )
             logits = ops.multiply(ops.tanh(ops.multiply(logits, scale)), inv)
         }

Original file line number	Diff line number	Diff line change
`@@ -473,10 +473,15 @@ public class PaddedSharedPositionalKVCache<T : DType, V>(`
`473`	`473`	`// remaining [srcHeadDim, targetHeadDim) stays zero`
`474`	`474`	`}`
`475`	`475`	`}`
`476`		`- return ctx.fromFloatArray<T, V>(`
`477`		`- sk.ainet.lang.tensor.Shape(nKV, seq, targetHeadDim),`
`478`		`- t.dtype,`
`479`		`- out`
	`476`	`+ // Heap-backed wrap — fromFloatArray would copy into a fresh`
	`477`	`+ // Arena.ofAuto MemorySegment per call; padHeadDim runs every`
	`478`	`+ // attention forward when src/target head_dim differ (Gemma 4`
	`479`	`+ // value-head padding), so direct memory accumulates without GC`
	`480`	`+ // pressure. Same root cause as commit 319c394.`
	`481`	`+ val padShape = sk.ainet.lang.tensor.Shape(nKV, seq, targetHeadDim)`
	`482`	`+ return ctx.fromData(`
	`483`	`+ sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(padShape, out) as sk.ainet.lang.tensor.data.TensorData<T, V>,`
	`484`	`+ t.dtype`
`480`	`485`	`)`
`481`	`486`	`}`
`482`	`487`	`}`
Original file line number	Diff line number	Diff line change
`@@ -322,7 +322,15 @@ public class MultiHeadAttention<T : DType, V>(`
`322`	`322`	`data[qi * seqKV + ki] = if (allowed) 0f else neg`
`323`	`323`	`}`
`324`	`324`	`}`
`325`		`- return ctx.fromFloatArray(Shape(1, 1, seqQ, seqKV), dtype, data)`
	`325`	`+ // Heap-backed wrap — fromFloatArray would copy into a fresh`
	`326`	`+ // Arena.ofAuto MemorySegment every forward (× layers using the`
	`327`	`+ // sliding-mask path), and direct memory doesn't pressure the GC.`
	`328`	`+ // Same root cause as the sliceView leak (commit 319c394).`
	`329`	`+ val maskShape = Shape(1, 1, seqQ, seqKV)`
	`330`	`+ return ctx.fromData(`
	`331`	`+ sk.ainet.lang.tensor.data.DenseFloatArrayTensorData<T>(maskShape, data) as sk.ainet.lang.tensor.data.TensorData<T, V>,`
	`332`	`+ dtype`
	`333`	`+ )`
`326`	`334`	`}`
`327`	`335`
`328`	`336`	`private fun repeatKVHeads(t: Tensor<T, V>, repeats: Int, ops: sk.ainet.lang.tensor.ops.TensorOps): Tensor<T, V> {`