Skip to content

Commit ce8b9ee

Browse files
michalharakal and claude committed
feat(bert): wire ScratchPool into the embedding hot path
Two changes that activate SKaiNET 0.21.0's ScratchPool SPI for the BERT encoder workload: 1. Wrap BertRuntime.forward in ctx.scratch.scope { ... }. Upstream SIMD kernels (matmul, dequant) acquire workspace from ctx.scratch internally; the scope drains acquired buffers back to the pool on exit. With the default NoopScratchPool this is a pass-through; with a real pool it eliminates per-forward FloatArray allocations on what is typically the busiest path for an embedding workload (encode() called many times in a row). 2. Add PooledExecutionContext — a thin ExecutionContext delegate that provides a SizeClassedScratchPool. Wire it as the default ctx in KBertJava.loadSafeTensors, since Java embedding consumers virtually always batch many encode() calls. Default behavior is preserved: callers that construct BertRuntime with a plain DirectCpuExecutionContext (no PooledExecutionContext wrapper) continue to use NoopScratchPool and see no change. 22/22 BertRuntime + BertNumericalAccuracy + HuggingFaceTokenizer tests green on JDK 25. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 2da0a2a commit ce8b9ee

3 files changed

Lines changed: 67 additions & 20 deletions

File tree

llm-inference/bert/src/commonMain/kotlin/sk/ainet/models/bert/BertRuntime.kt

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -110,27 +110,34 @@ public class BertRuntime<T : DType>(
110110
* @param tokenTypeIds optional segment IDs, shape [seqLen] (defaults to all zeros)
111111
* @return hidden states tensor of shape [seqLen, hiddenSize]
112112
*/
113-
public fun forward(tokenIds: IntArray, tokenTypeIds: IntArray? = null): Tensor<T, Float> {
114-
val seqLen = tokenIds.size
115-
val typeIds = tokenTypeIds ?: IntArray(seqLen) { 0 }
116-
val positionIds = IntArray(seqLen) { it }
117-
118-
// Embedding: word + position + token_type
119-
val wordEmb = wordEmbedding.forward(tokenIds, ctx)
120-
val posEmb = positionEmbedding.forward(positionIds, ctx)
121-
val typeEmb = tokenTypeEmbedding.forward(typeIds, ctx)
122-
123-
var hidden = wordEmb + posEmb + typeEmb
124-
hidden = embeddingLayerNorm.forward(hidden, ctx)
125-
126-
// Encoder layers
127-
for (i in weights.layers.indices) {
128-
hidden = runEncoderLayer(i, hidden)
113+
public fun forward(tokenIds: IntArray, tokenTypeIds: IntArray? = null): Tensor<T, Float> =
114+
ctx.scratch.scope {
115+
// ScratchPool scope for the whole forward pass: upstream SIMD
116+
// kernels (matmul, dequant) acquire their per-call workspace
117+
// from ctx.scratch and the buffers are returned to the pool on
118+
// scope exit. With the default NoopScratchPool this is a plain
119+
// pass-through; with a SizeClassedScratchPool it eliminates per-
120+
// forward FloatArray allocations on the embedding hot path.
121+
val seqLen = tokenIds.size
122+
val typeIds = tokenTypeIds ?: IntArray(seqLen) { 0 }
123+
val positionIds = IntArray(seqLen) { it }
124+
125+
// Embedding: word + position + token_type
126+
val wordEmb = wordEmbedding.forward(tokenIds, ctx)
127+
val posEmb = positionEmbedding.forward(positionIds, ctx)
128+
val typeEmb = tokenTypeEmbedding.forward(typeIds, ctx)
129+
130+
var hidden = wordEmb + posEmb + typeEmb
131+
hidden = embeddingLayerNorm.forward(hidden, ctx)
132+
133+
// Encoder layers
134+
for (i in weights.layers.indices) {
135+
hidden = runEncoderLayer(i, hidden)
136+
}
137+
138+
hidden
129139
}
130140

131-
return hidden
132-
}
133-
134141
/**
135142
* Encode text tokens into a single embedding vector (mean pooling + optional projection + L2 norm).
136143
*
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package sk.ainet.models.bert
2+
3+
import sk.ainet.context.ExecutionContext
4+
import sk.ainet.lang.tensor.scratch.ScratchPool
5+
import sk.ainet.lang.tensor.scratch.SizeClassedScratchPool
6+
7+
/**
8+
* Wraps an [ExecutionContext] with a [SizeClassedScratchPool] so that
9+
* upstream SIMD kernels and per-forward intermediates are pooled across
10+
* encoder calls.
11+
*
12+
* Use this when you intend to compute many embeddings from the same model:
13+
*
14+
* ```kotlin
15+
* val baseCtx = DirectCpuExecutionContext(tensorDataFactory = memSegFactory)
16+
* val pooledCtx = PooledExecutionContext(baseCtx)
17+
*
18+
* val runtime = BertRuntime(pooledCtx, weights, FP32::class)
19+
*
20+
* // Each forward acquires + releases scratch buffers in a per-call scope.
21+
* val v1 = runtime.encode(tokens1)
22+
* val v2 = runtime.encode(tokens2) // reuses pooled buffers
23+
* ```
24+
*
25+
* For one-shot use the default `NoopScratchPool` on a plain
26+
* `DirectCpuExecutionContext` is fine — pooling has no benefit when the
27+
* pool is never reused.
28+
*
29+
* **Threading:** `SizeClassedScratchPool` is single-threaded by intent.
30+
* Concurrent encoder calls must each have their own pooled context.
31+
*/
32+
public class PooledExecutionContext(
33+
private val delegate: ExecutionContext,
34+
override val scratch: ScratchPool = SizeClassedScratchPool(),
35+
) : ExecutionContext by delegate

llm-inference/bert/src/jvmMain/kotlin/sk/ainet/models/bert/java/KBertJava.kt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package sk.ainet.models.bert.java
55
import kotlinx.coroutines.runBlocking
66
import sk.ainet.models.bert.*
77
import sk.ainet.context.DirectCpuExecutionContext
8+
import sk.ainet.models.bert.PooledExecutionContext
89
import sk.ainet.io.JvmRandomAccessSource
910
import sk.ainet.io.safetensors.SafeTensorsParametersLoader
1011
import sk.ainet.lang.types.FP32
@@ -46,7 +47,11 @@ public object KBertJava {
4647
val config = detectConfig(modelDir)
4748

4849
val tokenizer = HuggingFaceTokenizer.fromVocabTxt(vocabPath.readText())
49-
val ctx = DirectCpuExecutionContext()
50+
// Pool scratch buffers across encode() calls — embedding workloads
51+
// typically encode many strings in a row, so the SizeClassedScratchPool
52+
// returns real wins. With a single one-shot call the pool is no
53+
// worse than NoopScratchPool.
54+
val ctx = PooledExecutionContext(DirectCpuExecutionContext())
5055

5156
val ingestion = BertIngestion<FP32>(ctx, FP32::class, config)
5257
val loader = SafeTensorsParametersLoader(

0 commit comments

Comments (0)