Skip to content

Commit fe284fe

Browse files
committed
fix: improve measure safety
1 parent 547c692 commit fe284fe

2 files changed

Lines changed: 33 additions & 15 deletions

File tree

src/cli/commands/inspect/commands/InspectMeasureCommand.ts

Lines changed: 31 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -941,27 +941,43 @@ async function runTestWorkerLogic() {
941941

942942
async function testWithGpuLayers({
943943
modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention,
944-
kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false
944+
kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, isFirstLoad
945945
}: {
946946
modelPath: string, useMmap?: "auto" | boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number,
947947
maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType,
948948
swaFullCache?: boolean, batchSize?: number,
949-
evaluateText?: string, exitAfterMeasurement?: boolean
949+
evaluateText?: string, exitAfterMeasurement?: boolean,
950+
isFirstLoad: boolean
950951
}) {
951952
try {
952953
const preModelVramUsage = (await llama._getRawVramState()).used;
953954
const preModelRamUsage = getMemoryUsage(llama);
954-
const model = await llama.loadModel({
955-
modelPath,
956-
useMmap,
957-
useDirectIo,
958-
gpuLayers,
959-
defaultContextFlashAttention: flashAttention,
960-
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
961-
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
962-
defaultContextSwaFullCache: swaFullCache,
963-
ignoreMemorySafetyChecks: true
964-
});
955+
let model: LlamaModel | undefined = undefined;
956+
957+
for (let triesLeft = 2; triesLeft > 0; triesLeft--) {
958+
try {
959+
model = await llama.loadModel({
960+
modelPath,
961+
useMmap,
962+
useDirectIo,
963+
gpuLayers,
964+
defaultContextFlashAttention: flashAttention,
965+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
966+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
967+
defaultContextSwaFullCache: swaFullCache
968+
});
969+
} catch (err) {
970+
if (isFirstLoad || triesLeft === 1)
971+
throw err;
972+
973+
// wait for the locked memory to free up before trying again
974+
await new Promise((accept) => setTimeout(accept, 6 * 1000));
975+
}
976+
}
977+
978+
if (model == null)
979+
throw new Error("Failed to load model");
980+
965981
const postModelVramUsage = (await llama._getRawVramState()).used;
966982
const postModelRamUsage = getMemoryUsage(llama);
967983

@@ -1044,7 +1060,8 @@ async function runTestWorkerLogic() {
10441060
swaFullCache: message.swaFullCache,
10451061
batchSize: message.batchSize,
10461062
evaluateText: message.evaluateText,
1047-
exitAfterMeasurement: message.exitAfterMeasurement
1063+
exitAfterMeasurement: message.exitAfterMeasurement,
1064+
isFirstLoad: gpuLayers == message.maxGpuLayers
10481065
});
10491066

10501067
if (measurementsDone > 0 && message.exitAfterMeasurement) {

src/evaluator/LlamaModel/LlamaModel.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -234,7 +234,8 @@ export class LlamaModel {
234234
public readonly onDispose = new EventRelay<void>();
235235

236236
private constructor({
237-
modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides
237+
modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock = false, checkTensors, onLoadProgress, loadSignal,
238+
metadataOverrides
238239
}: LlamaModelOptions & {
239240
gpuLayers: number,
240241
useMmap: boolean

0 commit comments

Comments
 (0)