@@ -941,27 +941,43 @@ async function runTestWorkerLogic() {
941941
942942 async function testWithGpuLayers ( {
943943 modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention,
944- kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false
944+ kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false , isFirstLoad
945945 } : {
946946 modelPath : string , useMmap ?: "auto" | boolean , useDirectIo ?: boolean , gpuLayers : number , tests : number , startContextSize ?: number ,
947947 maxContextSize ?: number , minContextSize ?: number , flashAttention ?: boolean , kvCacheKeyType ?: GgmlType , kvCacheValueType ?: GgmlType ,
948948 swaFullCache ?: boolean , batchSize ?: number ,
949- evaluateText ?: string , exitAfterMeasurement ?: boolean
949+ evaluateText ?: string , exitAfterMeasurement ?: boolean ,
950+ isFirstLoad : boolean
950951 } ) {
951952 try {
952953 const preModelVramUsage = ( await llama . _getRawVramState ( ) ) . used ;
953954 const preModelRamUsage = getMemoryUsage ( llama ) ;
954- const model = await llama . loadModel ( {
955- modelPath,
956- useMmap,
957- useDirectIo,
958- gpuLayers,
959- defaultContextFlashAttention : flashAttention ,
960- experimentalDefaultContextKvCacheKeyType : kvCacheKeyType ,
961- experimentalDefaultContextKvCacheValueType : kvCacheValueType ,
962- defaultContextSwaFullCache : swaFullCache ,
963- ignoreMemorySafetyChecks : true
964- } ) ;
955+ let model : LlamaModel | undefined = undefined ;
956+
957+ for ( let triesLeft = 2 ; triesLeft > 0 ; triesLeft -- ) {
958+ try {
959+ model = await llama . loadModel ( {
960+ modelPath,
961+ useMmap,
962+ useDirectIo,
963+ gpuLayers,
964+ defaultContextFlashAttention : flashAttention ,
965+ experimentalDefaultContextKvCacheKeyType : kvCacheKeyType ,
966+ experimentalDefaultContextKvCacheValueType : kvCacheValueType ,
967+ defaultContextSwaFullCache : swaFullCache
968+ } ) ;
969+ } catch ( err ) {
970+ if ( isFirstLoad || triesLeft === 1 )
971+ throw err ;
972+
973+ // wait for the locked memory to free up before trying again
974+ await new Promise ( ( accept ) => setTimeout ( accept , 6 * 1000 ) ) ;
975+ }
976+ }
977+
978+ if ( model == null )
979+ throw new Error ( "Failed to load model" ) ;
980+
965981 const postModelVramUsage = ( await llama . _getRawVramState ( ) ) . used ;
966982 const postModelRamUsage = getMemoryUsage ( llama ) ;
967983
@@ -1044,7 +1060,8 @@ async function runTestWorkerLogic() {
10441060 swaFullCache : message . swaFullCache ,
10451061 batchSize : message . batchSize ,
10461062 evaluateText : message . evaluateText ,
1047- exitAfterMeasurement : message . exitAfterMeasurement
1063+ exitAfterMeasurement : message . exitAfterMeasurement ,
1064+ isFirstLoad : gpuLayers == message . maxGpuLayers
10481065 } ) ;
10491066
10501067 if ( measurementsDone > 0 && message . exitAfterMeasurement ) {
0 commit comments