fix: improve measure safety

giladgd · giladgd · commit fe284fee7f50 · 2026-05-20T21:16:28.000+02:00
diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts
@@ -941,27 +941,45 @@ async function runTestWorkerLogic() {
 
     async function testWithGpuLayers({
         modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention,
-        kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false
+        kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, isFirstLoad
     }: {
         modelPath: string, useMmap?: "auto" | boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number,
         maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType,
         swaFullCache?: boolean, batchSize?: number,
-        evaluateText?: string, exitAfterMeasurement?: boolean
+        evaluateText?: string, exitAfterMeasurement?: boolean,
+        isFirstLoad: boolean
     }) {
         try {
             const preModelVramUsage = (await llama._getRawVramState()).used;
             const preModelRamUsage = getMemoryUsage(llama);
-            const model = await llama.loadModel({
-                modelPath,
-                useMmap,
-                useDirectIo,
-                gpuLayers,
-                defaultContextFlashAttention: flashAttention,
-                experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
-                experimentalDefaultContextKvCacheValueType: kvCacheValueType,
-                defaultContextSwaFullCache: swaFullCache,
-                ignoreMemorySafetyChecks: true
-            });
+            let model: LlamaModel | undefined = undefined;
+
+            for (let triesLeft = 2; triesLeft > 0; triesLeft--) {
+                try {
+                    model = await llama.loadModel({
+                        modelPath,
+                        useMmap,
+                        useDirectIo,
+                        gpuLayers,
+                        defaultContextFlashAttention: flashAttention,
+                        experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
+                        experimentalDefaultContextKvCacheValueType: kvCacheValueType,
+                        defaultContextSwaFullCache: swaFullCache
+                    });
+                    // loaded successfully - stop retrying so the model isn't loaded a second time
+                    break;
+                } catch (err) {
+                    if (isFirstLoad || triesLeft === 1)
+                        throw err;
+
+                    // wait for the locked memory to free up before trying again
+                    await new Promise((accept) => setTimeout(accept, 6 * 1000));
+                }
+            }
+
+            if (model == null)
+                throw new Error("Failed to load model");
+
             const postModelVramUsage = (await llama._getRawVramState()).used;
             const postModelRamUsage = getMemoryUsage(llama);
 
@@ -1044,7 +1060,8 @@ async function runTestWorkerLogic() {
                     swaFullCache: message.swaFullCache,
                     batchSize: message.batchSize,
                     evaluateText: message.evaluateText,
-                    exitAfterMeasurement: message.exitAfterMeasurement
+                    exitAfterMeasurement: message.exitAfterMeasurement,
+                    isFirstLoad: gpuLayers == message.maxGpuLayers
                 });
 
                 if (measurementsDone > 0 && message.exitAfterMeasurement) {
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts
@@ -234,7 +234,8 @@ export class LlamaModel {
     public readonly onDispose = new EventRelay<void>();
 
     private constructor({
-        modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides
+        modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock = false, checkTensors, onLoadProgress, loadSignal,
+        metadataOverrides
     }: LlamaModelOptions & {
         gpuLayers: number,
         useMmap: boolean