Skip to content

Commit 4baa480

Browse files
authored
fix: adapt to llama.cpp changes (#547)
* fix: adapt to `llama.cpp` changes
* fix: change the level of common logs
1 parent 1997b4e commit 4baa480

File tree

6 files changed

+14
-38
lines changed

6 files changed

+14
-38
lines changed

llama/addon/AddonModel.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,13 @@ void AddonModel::dispose() {
362362
}
363363

364364
disposed = true;
365+
366+
if (data != nullptr) {
367+
auto currentData = data;
368+
data = nullptr;
369+
delete currentData;
370+
}
371+
365372
if (modelLoaded) {
366373
modelLoaded = false;
367374
llama_model_free(model);
@@ -370,12 +377,6 @@ void AddonModel::dispose() {
370377
loadedModelSize = 0;
371378
}
372379

373-
if (data != nullptr) {
374-
auto currentData = data;
375-
data = nullptr;
376-
delete currentData;
377-
}
378-
379380
if (hasAddonExportsRef) {
380381
addonExportsRef.Unref();
381382
hasAddonExportsRef = false;

llama/addon/AddonModelLora.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,8 @@ AddonModelLora::~AddonModelLora() {
5353

5454
void AddonModelLora::dispose(bool skipErase) {
5555
if (lora_adapter != nullptr) {
56-
auto loraAdapterToDispose = lora_adapter;
5756
lora_adapter = nullptr;
58-
llama_adapter_lora_free(loraAdapterToDispose);
59-
57+
6058
if (!skipErase && model->data != nullptr) {
6159
model->data->removeLora(this);
6260
}

src/bindings/Llama.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,10 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil
690690
return LlamaLogLevel.info;
691691
else if (level === LlamaLogLevel.warn && message.startsWith("llama_init_from_model: model default pooling_type is [0], but [-1] was specified"))
692692
return LlamaLogLevel.info;
693+
else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is enabled, disabling mmap"))
694+
return LlamaLogLevel.info;
695+
else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is not available, using mmap"))
696+
return LlamaLogLevel.info;
693697
else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU"))
694698
return LlamaLogLevel.info;
695699
else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for"))

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ export class LlamaContext {
6060
/** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator();
6161
/** @internal */ private readonly _modelPreventDisposalHandle: DisposalPreventionHandle;
6262
/** @internal */ private readonly _loraAdapters = new Set<AddonModelLora>();
63-
/** @internal */ private readonly _gcRegistry: FinalizationRegistry<Set<AddonModelLora>>;
6463
/** @internal */ private _nextGeneratedSequenceId = 0;
6564
/** @internal */ private _dispatchDecodeScheduled = false;
6665
/** @internal */ private _batchDispatchPending = false;
@@ -146,30 +145,20 @@ export class LlamaContext {
146145
dispatchSchedule: batchingDispatchSchedule,
147146
itemPrioritizationStrategy: batchingItemsPrioritizationStrategy
148147
};
149-
this._gcRegistry = new FinalizationRegistry(this._model._removeLoraUsage);
150-
this._gcRegistry.register(this, this._loraAdapters);
151148

152149
this._reclaimUnusedSequenceId = this._reclaimUnusedSequenceId.bind(this);
153150
this._freeReservedThreads = this._freeReservedThreads.bind(this);
154151

155152
this._disposeAggregator.add(() => {
156153
this._disposed = true;
157154
});
158-
this._disposeAggregator.add(() => void this._gcRegistry.unregister(this));
159155
this._disposeAggregator.add(this._onReclaimUnusedSequenceId);
160156
this._disposeAggregator.add(this.onDispose.dispatchEvent);
161157
this._disposeAggregator.add(
162158
this.model.onDispose.createListener(
163159
disposeContextIfReferenced.bind(null, new WeakRef(this))
164160
)
165161
);
166-
this._disposeAggregator.add((): Promise<void> | void => {
167-
if (this._loraAdapters.size > 0) {
168-
const loraAdapters = new Set(this._loraAdapters);
169-
this._loraAdapters.clear();
170-
return this._model._removeLoraUsage(loraAdapters);
171-
}
172-
});
173162

174163
this._disposeAggregator.add(async () => {
175164
await this._backendContextDisposeGuard.acquireDisposeLock();

src/evaluator/LlamaContext/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ export type LlamaContextOptions = {
126126
* without the need for extensive retraining from scratch.
127127
*
128128
* If a string is provided, it will be treated as a path to a single LoRA adapter file.
129+
*
130+
* The adapters will be released from memory once the model (not just the context) is disposed.
129131
*/
130132
lora?: string | {
131133
adapters: Array<{

src/evaluator/LlamaModel/LlamaModel.ts

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -275,8 +275,6 @@ export class LlamaModel {
275275
this._llamaPreventDisposalHandle.dispose();
276276
});
277277

278-
this._removeLoraUsage = this._removeLoraUsage.bind(this);
279-
280278
this.tokenize = this.tokenize.bind(this);
281279
this.detokenize = this.detokenize.bind(this);
282280
this.isSpecialToken = this.isSpecialToken.bind(this);
@@ -703,22 +701,6 @@ export class LlamaModel {
703701
});
704702
}
705703

706-
/** @internal */
707-
public async _removeLoraUsage(loraAdapters: Set<AddonModelLora>) {
708-
return await withLock([this._loraAdapters, "modify"], async () => {
709-
await Promise.all(
710-
[...loraAdapters].map(async (lora) => {
711-
lora.usages--;
712-
713-
if (lora.usages <= 0 && this._loraAdapters.get(lora.filePath) === lora) {
714-
this._loraAdapters.delete(lora.filePath);
715-
await lora.dispose();
716-
}
717-
})
718-
);
719-
});
720-
}
721-
722704
/** @internal */
723705
public static async _create(modelOptions: LlamaModelOptions, {
724706
_llama

0 commit comments

Comments (0)