Commit e70bcd0

fix: pad the context size to align with the implementation in llama.cpp
1 parent 24daf6d · commit e70bcd0

6 files changed

Lines changed: 102 additions & 71 deletions

src/cli/commands/inspect/commands/InspectMeasureCommand.ts

Lines changed: 3 additions & 0 deletions
```diff
@@ -21,6 +21,7 @@ import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDes
 import {documentationPageUrls} from "../../../../config.js";
 import {Llama} from "../../../../bindings/Llama.js";
 import {toBytes} from "../../../utils/toBytes.js";
+import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js";
 
 type InspectMeasureCommand = {
     modelPath?: string,
@@ -952,6 +953,8 @@ function getContextSizesCheckPlan(trainContextSize: number, tests: number = 10,
         if (size < 2)
             size = 2;
 
+        size = padSafeContextSize(size, "up");
+
         if (res[res.length - 1] === size) {
             shouldStop = true;
             return;
```
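
For context, `getContextSizesCheckPlan` builds the list of context sizes that `inspect measure` benchmarks. Padding each candidate up to a multiple of 256 means nearby candidates can collapse into the same value, which the `res[res.length - 1] === size` check then turns into an early stop instead of a duplicate measurement. A minimal sketch of that effect, using the `padSafeContextSize` helper added in this commit (the candidate numbers are hypothetical):

```ts
// Import path assumes the repo root as the working directory:
import {padSafeContextSize} from "./src/evaluator/LlamaContext/utils/padSafeContextSize.js";

// Hypothetical candidate sizes, e.g. from a sweep down from the train context size:
const candidates = [8192, 5000, 3000, 1800, 1100, 700, 400, 200, 2];

// After `size = padSafeContextSize(size, "up")`, every candidate is a multiple
// of 256, and neighbors may collapse into the same padded value:
const padded = candidates.map((size) => padSafeContextSize(Math.max(2, size), "up"));
console.log(padded); // [8192, 5120, 3072, 2048, 1280, 768, 512, 256, 256]
// The duplicate trailing 256s are what the early-stop check catches.
```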

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 6 additions & 2 deletions
```diff
@@ -22,6 +22,7 @@ import {
 import {resolveBatchItemsPrioritizationStrategy} from "./utils/resolveBatchItemsPrioritizationStrategy.js";
 import {LlamaSampler} from "./LlamaSampler.js";
 import {TokenPredictor} from "./TokenPredictor.js";
+import {padSafeContextSize} from "./utils/padSafeContextSize.js";
 import type {Llama} from "../../bindings/Llama.js";
 
 const defaultLoraScale = 1;
@@ -98,12 +99,15 @@ export class LlamaContext {
         if (_model.disposed)
             throw new DisposedError();
 
+        const kvUnified = false;
         this._llama = _model._llama;
         this._model = _model;
         this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]);
         this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle();
         this._totalSequences = Math.max(1, Math.floor(sequences));
-        this._contextSize = Math.max(2, contextSize);
+        this._contextSize = kvUnified
+            ? Math.floor(padSafeContextSize(Math.max(2, contextSize) * this._totalSequences, "up") / this._totalSequences)
+            : padSafeContextSize(Math.max(2, contextSize), "up");
         this._batchSize = Math.max(batchSize, this._totalSequences);
         this._flashAttention = flashAttention;
         this._idealThreads = typeof threads === "number"
@@ -124,7 +128,7 @@ export class LlamaContext {
         this._performanceTracking = !!performanceTracking;
         this._swaFullCache = !!swaFullCache;
         this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
-            contextSize: this._contextSize * this._totalSequences, // each sequence needs its own <contextSize> of cells
+            contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
             batchSize: this._batchSize + (
                 (!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0)
                     ? 1 // +1 to handle edge cases with SWA KV cache
```
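
`kvUnified` is hardcoded to `false` for now (presumably a placeholder for a future unified KV-cache mode), so the second branch runs: the per-sequence context size is padded up on its own, and the backend cache is then sized from the padded value. A worked sketch of both branches with hypothetical numbers:

```ts
// Import path assumes this file sits next to LlamaContext.ts:
import {padSafeContextSize} from "./utils/padSafeContextSize.js";

const contextSize = 900;
const totalSequences = 3;

// kvUnified === false (current behavior): pad each sequence's context size,
// then allocate that many KV cells per sequence in the backend.
const perSequence = padSafeContextSize(Math.max(2, contextSize), "up");      // 1024
const backendCells = padSafeContextSize(perSequence * totalSequences, "up"); // 3072

// kvUnified === true: pad the total cell count once and split it evenly,
// so each sequence can end up with fewer cells than its individually
// padded size.
const unifiedPerSequence = Math.floor(
    padSafeContextSize(Math.max(2, contextSize) * totalSequences, "up") / totalSequences
); // floor(2816 / 3) === 938
```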
src/evaluator/LlamaContext/utils/padSafeContextSize.ts

Lines changed: 20 additions & 0 deletions

```diff
@@ -0,0 +1,20 @@
+const contextSizePad = 256;
+
+export function padSafeContextSize(value: number, padDirection: "up" | "down", padding: number = contextSizePad) {
+    const paddedSize = ggmlPad(value, padding);
+
+    if (paddedSize === value)
+        return value;
+    else if (padDirection === "up")
+        return paddedSize;
+    else if (padDirection === "down") {
+        const smallerPaddedSize = ggmlPad(value - padding, padding);
+        if (smallerPaddedSize >= padding)
+            return smallerPaddedSize;
+    }
+
+    return paddedSize;
+}
+function ggmlPad(value: number, padding: number): number {
+    return ((value + padding - 1) & ~(padding - 1));
+}
```
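
For reference, `ggmlPad` mirrors ggml's `GGML_PAD` macro: `(value + padding - 1) & ~(padding - 1)` rounds `value` up to the next multiple of `padding`, which works only because the padding is a power of two (256 here). A few illustrative calls (hypothetical values, behavior taken from the code above):

```ts
// Import path assumes the repo root as the working directory:
import {padSafeContextSize} from "./src/evaluator/LlamaContext/utils/padSafeContextSize.js";

padSafeContextSize(1024, "up");   // 1024 - already a multiple of 256, returned as-is
padSafeContextSize(1000, "up");   // 1024 - rounded up to the next multiple of 256
padSafeContextSize(1000, "down"); // 768  - rounded down to the previous multiple of 256
padSafeContextSize(100, "down");  // 256  - "down" never returns less than one padding
                                  //        unit, so small values still get padded up
```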

src/gguf/insights/GgufInsights.ts

Lines changed: 5 additions & 1 deletion
```diff
@@ -5,6 +5,7 @@ import {GgufFileInfo} from "../types/GgufFileInfoTypes.js";
 import {GgufTensorInfo} from "../types/GgufTensorInfoTypes.js";
 import {GgufArchitectureType} from "../types/GgufMetadataTypes.js";
 import {getReadablePath} from "../../cli/utils/getReadablePath.js";
+import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js";
 import {GgufInsightsConfigurationResolver} from "./GgufInsightsConfigurationResolver.js";
 import {GgufInsightsTokens} from "./GgufInsightsTokens.js";
 
@@ -211,6 +212,7 @@ export class GgufInsights {
         const llmData = this._ggufFileInfo.architectureMetadata;
         const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? [];
         const slidingWindow = this.swaSize ?? 0;
+        const kvUnified = false;
         const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize &&
             (this.trainContextSize == null || slidingWindow < this.trainContextSize);
         const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture);
@@ -220,7 +222,9 @@ export class GgufInsights {
 
         // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp`
         const kvCachePadding = 1;
-        const actualContextSize = sequences * contextSize;
+        const actualContextSize = kvUnified
+            ? padSafeContextSize(sequences * contextSize, "up")
+            : sequences * padSafeContextSize(contextSize, "up");
         const kvSize = usingSWA
             ? (
                 (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) +
```
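
Since `kvUnified` is also `false` here, the memory estimate pads each sequence's context size before multiplying, mirroring the backend `contextSize` that `LlamaContext` now passes to `AddonContext`. A worked sketch of how the two branches differ, with hypothetical numbers:

```ts
// Import path assumes the repo root as the working directory:
import {padSafeContextSize} from "./src/evaluator/LlamaContext/utils/padSafeContextSize.js";

const sequences = 3;
const contextSize = 900;

// Unified KV cache: pad the total cell count once.
const unified = padSafeContextSize(sequences * contextSize, "up"); // pad(2700) === 2816

// Split KV cache (the branch taken while kvUnified is false): pad per
// sequence, then multiply. This is never smaller than the unified value,
// so the estimate stays conservative.
const split = sequences * padSafeContextSize(contextSize, "up");   // 3 * 1024 === 3072
```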
