feat(gemma): optional maxInferenceLen on GemmaNetworkLoader.load() (#178)

michalharakal · claude · michalharakal · commit 4acec7f45223 · 2026-06-15T15:42:09.000+02:00
The eager network sizes its KV cache and RoPE tables for maxInferenceLen
(= min(contextLength, 4096) by default). On the 1.9 GB SL2610, that ~0.4 GB
KV cache (allocated at the first forward pass) OOMs the board, even after
the packed Q8_0 lm_head dropped the resident weight footprint to ~1.06 GB.

Thread an optional `maxInferenceLen: Int? = null` through
load() -> applyWeightsToNetwork -> applyWeightsToNetworkNonReified ->
gemmaNetwork so a constrained-device consumer can cap the context (e.g. 32
for a short tool-call prompt), shrinking the KV cache ~100x. Default null
preserves the existing min(contextLength, 4096) behaviour.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt
@@ -120,7 +120,8 @@ public class GemmaNetworkLoader @PublishedApi internal constructor(
      * Load weights and build a fully initialized DSL network.
      */
     public suspend inline fun <reified T : DType, V> load(
-        ctx: ExecutionContext
+        ctx: ExecutionContext,
+        maxInferenceLen: Int? = null,
     ): Module<T, V> {
         val rawWeights: Gemma4Weights<T, V> = when (val wp = weightsProvider) {
             is WeightsProvider.GgufSource -> {
@@ -160,14 +161,15 @@ public class GemmaNetworkLoader @PublishedApi internal constructor(
                 rawWeights
             }
 
-        return applyWeightsToNetwork(ctx, weights)
+        return applyWeightsToNetwork(ctx, weights, maxInferenceLen)
     }
 
     @PublishedApi
     internal inline fun <reified T : DType, V> applyWeightsToNetwork(
         ctx: ExecutionContext,
-        weights: Gemma4Weights<T, V>
-    ): Module<T, V> = applyWeightsToNetworkNonReified(ctx, weights, T::class, debug)
+        weights: Gemma4Weights<T, V>,
+        maxInferenceLen: Int? = null,
+    ): Module<T, V> = applyWeightsToNetworkNonReified(ctx, weights, T::class, debug, maxInferenceLen)
 }
 
 /** Shared non-reified impl used by both the inline-reified companion helpers
@@ -177,7 +179,8 @@ internal fun <T : DType, V> applyWeightsToNetworkNonReified(
     ctx: ExecutionContext,
     weights: Gemma4Weights<T, V>,
     dtype: kotlin.reflect.KClass<T>,
-    debug: Boolean
+    debug: Boolean,
+    maxInferenceLen: Int? = null,
 ): Module<T, V> {
     // Enable optional Gemma 4 features iff the checkpoint actually carries
     // their weights. Real Gemma 4 GGUFs do; synthetic toy-model tests do not,
@@ -197,6 +200,7 @@ internal fun <T : DType, V> applyWeightsToNetworkNonReified(
     val model = gemmaNetwork<T, V>(
         weights.metadata,
         dtype,
+        maxInferenceLen = maxInferenceLen ?: minOf(weights.metadata.contextLength, 4096),
         qkNorm = hasQKNorm,
         sandwichNorms = hasSandwichNorms,
         layerOutputScale = hasLayerOutputScale,