Merged
24 commits — the diff below shows changes from 10 of them
6191ee2
fix: Qwen 3.5 memory estimation
giladgd Mar 6, 2026
140d888
fix: bugs
giladgd Mar 6, 2026
25effb9
fix: grammar use with HarmonyChatWrapper
giladgd Mar 8, 2026
48e5ed4
fix: Qwen 3.5 function calling syntax detection
giladgd Mar 10, 2026
8b284f4
feat(`QwenChatWrapper`): support Qwen 3.5
giladgd Mar 10, 2026
ab65037
feat: automatic checkpoints for models that need it
giladgd Mar 12, 2026
07fa753
fix: add mistral think segment detection
giladgd Mar 12, 2026
8f7209e
feat: initial disk cache dir option for future optimizations (disable…
giladgd Mar 12, 2026
1bb4a78
test: Qwen 3.5 checkpoint use
giladgd Mar 12, 2026
e121326
fix: bugs
giladgd Mar 12, 2026
d9bc17b
test: fix test
giladgd Mar 12, 2026
2a93abe
feat(`inspect gpu` command): detect and report missing prebuilt binar…
giladgd Mar 13, 2026
d2d38d9
test: fix test
giladgd Mar 13, 2026
f2908ca
fix: checkpoints memory restoration
giladgd Mar 14, 2026
4664976
fix: bug
giladgd Mar 14, 2026
12ce825
chore: update modules
giladgd Mar 14, 2026
82fa13c
test: better crash logs on test process crash
giladgd Mar 14, 2026
89b293f
fix: compress excessively long segments from the current response on …
giladgd Mar 14, 2026
28a236b
fix: default thinking budget to 75% of the context size to avoid low …
giladgd Mar 14, 2026
8c9c764
chore: update `package-lock.json`
giladgd Mar 14, 2026
b4b978d
fix(`chat` command): default reasoning budget option value description
giladgd Mar 14, 2026
841b122
docs: update awesome list
giladgd Mar 14, 2026
1868c02
test: fix the CI test runner
giladgd Mar 14, 2026
1b33d2c
fix: bugs
giladgd Mar 14, 2026
155 changes: 155 additions & 0 deletions llama/addon/AddonContext.cpp
@@ -965,6 +965,61 @@ Napi::Value AddonContext::SetLoras(const Napi::CallbackInfo& info) {
return info.Env().Undefined();
}

class RestoreCheckpointWorker : public Napi::AsyncWorker {
public:
AddonContext* context;
AddonContextSequenceCheckpoint* checkpoint;
bool restoreSuccess = false;

RestoreCheckpointWorker(const Napi::CallbackInfo& info, AddonContext* context, AddonContextSequenceCheckpoint* checkpoint)
: Napi::AsyncWorker(info.Env(), "RestoreCheckpointWorker"),
context(context),
checkpoint(checkpoint),
deferred(Napi::Promise::Deferred::New(info.Env())) {
context->Ref();
checkpoint->Ref();
}
~RestoreCheckpointWorker() {
context->Unref();
checkpoint->Unref();
}

Napi::Promise GetPromise() {
return deferred.Promise();
}

protected:
Napi::Promise::Deferred deferred;

void Execute() {
try {
std::lock_guard<std::mutex> lock(checkpoint->dataMutex);

std::size_t dataSize = checkpoint->data.size();
std::size_t restoreSize = llama_state_seq_set_data_ext(context->ctx, checkpoint->data.data(), dataSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
restoreSuccess = restoreSize == dataSize;
} catch (const std::exception& e) {
SetError(e.what());
} catch(...) {
SetError("Unknown error when calling \"llama_state_seq_set_data_ext\"");
}
}
void OnOK() {
deferred.Resolve(Napi::Boolean::New(Env(), restoreSuccess));
}
void OnError(const Napi::Error& err) {
deferred.Reject(err.Value());
}
};

Napi::Value AddonContext::RestoreCheckpoint(const Napi::CallbackInfo& info) {
AddonContextSequenceCheckpoint* checkpoint = Napi::ObjectWrap<AddonContextSequenceCheckpoint>::Unwrap(info[0].As<Napi::Object>());

RestoreCheckpointWorker* worker = new RestoreCheckpointWorker(info, this, checkpoint);
worker->Queue();
return worker->GetPromise();
}

void AddonContext::init(Napi::Object exports) {
exports.Set(
"AddonContext",
@@ -992,8 +1047,108 @@ void AddonContext::init(Napi::Object exports) {
InstanceMethod("saveSequenceStateToFile", &AddonContext::SaveSequenceStateToFile),
InstanceMethod("loadSequenceStateFromFile", &AddonContext::LoadSequenceStateFromFile),
InstanceMethod("setLoras", &AddonContext::SetLoras),
InstanceMethod("restoreCheckpoint", &AddonContext::RestoreCheckpoint),
InstanceMethod("dispose", &AddonContext::Dispose),
}
)
);
}

AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonContextSequenceCheckpoint>(info) {

}
AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() {
dispose();
}

class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker {
public:
AddonContextSequenceCheckpoint* checkpoint;
AddonContext* context;

AddonContextSequenceCheckpointInitWorker(const Napi::CallbackInfo& info, AddonContextSequenceCheckpoint* checkpoint, AddonContext* context)
: Napi::AsyncWorker(info.Env(), "AddonContextSequenceCheckpointInitWorker"),
checkpoint(checkpoint),
context(context),
deferred(Napi::Promise::Deferred::New(info.Env())) {
checkpoint->Ref();
context->Ref();
}
~AddonContextSequenceCheckpointInitWorker() {
checkpoint->Unref();
context->Unref();
}

Napi::Promise GetPromise() {
return deferred.Promise();
}

protected:
Napi::Promise::Deferred deferred;

void Execute() {
try {
checkpoint->index = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId);
const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

checkpoint->data.reserve(checkpointSize);
checkpoint->data.resize(checkpointSize);
llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
} catch (const std::exception& e) {
SetError(e.what());
} catch(...) {
SetError("Unknown error when calling \"llama_state_seq_get_data_ext\"");
}
}
void OnOK() {
deferred.Resolve(Env().Undefined());
}
void OnError(const Napi::Error& err) {
deferred.Reject(err.Value());
}
};

Napi::Value AddonContextSequenceCheckpoint::Init(const Napi::CallbackInfo& info) {
AddonContext * context = Napi::ObjectWrap<AddonContext>::Unwrap(info[0].As<Napi::Object>());
sequenceId = info[1].As<Napi::Number>().Int32Value();

AddonContextSequenceCheckpointInitWorker* worker = new AddonContextSequenceCheckpointInitWorker(info, this, context);
worker->Queue();
return worker->GetPromise();
}

Napi::Value AddonContextSequenceCheckpoint::Dispose(const Napi::CallbackInfo& info) {
dispose();
return info.Env().Undefined();
}

void AddonContextSequenceCheckpoint::dispose() {
std::lock_guard<std::mutex> lock(dataMutex);
data.clear();
data.resize(0);
}

Napi::Value AddonContextSequenceCheckpoint::GetSize(const Napi::CallbackInfo& info) {
return Napi::Number::New(info.Env(), data.size());
}

Napi::Value AddonContextSequenceCheckpoint::GetIndex(const Napi::CallbackInfo& info) {
return Napi::Number::New(info.Env(), index);
}

void AddonContextSequenceCheckpoint::init(Napi::Object exports) {
exports.Set(
"AddonContextSequenceCheckpoint",
DefineClass(
exports.Env(),
"AddonContextSequenceCheckpoint",
{
InstanceMethod("init", &AddonContextSequenceCheckpoint::Init),
InstanceMethod("dispose", &AddonContextSequenceCheckpoint::Dispose),

InstanceAccessor("size", &AddonContextSequenceCheckpoint::GetSize, nullptr),
InstanceAccessor("index", &AddonContextSequenceCheckpoint::GetIndex, nullptr),
}
)
);
}
25 changes: 25 additions & 0 deletions llama/addon/AddonContext.h
@@ -1,4 +1,7 @@
#pragma once

#include <mutex>

#include "llama.h"
#include "napi.h"
#include "addonGlobals.h"
@@ -53,6 +56,28 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
Napi::Value EnsureDraftContextIsCompatibleForSpeculative(const Napi::CallbackInfo& info);

Napi::Value SetLoras(const Napi::CallbackInfo& info);
Napi::Value RestoreCheckpoint(const Napi::CallbackInfo& info);

static void init(Napi::Object exports);
};

class AddonContextSequenceCheckpoint : public Napi::ObjectWrap<AddonContextSequenceCheckpoint> {
public:
std::mutex dataMutex;
std::vector<uint8_t> data;
llama_seq_id sequenceId = 0;
std::size_t index = 0;

AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info);
~AddonContextSequenceCheckpoint();

Napi::Value Init(const Napi::CallbackInfo& info);
Napi::Value Dispose(const Napi::CallbackInfo& info);

void dispose();

Napi::Value GetSize(const Napi::CallbackInfo& info);
Napi::Value GetIndex(const Napi::CallbackInfo& info);

static void init(Napi::Object exports);
};
1 change: 1 addition & 0 deletions llama/addon/addon.cpp
@@ -317,6 +317,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
AddonGrammar::init(exports);
AddonGrammarEvaluationState::init(exports);
AddonContext::init(exports);
AddonContextSequenceCheckpoint::init(exports);
AddonSampler::init(exports);

llama_log_set(addonLlamaCppLogCallback, nullptr);
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -186,7 +186,7 @@
"zx": "^8.8.5"
},
"dependencies": {
"@huggingface/jinja": "^0.5.5",
"@huggingface/jinja": "^0.5.6",
"async-retry": "^1.3.3",
"bytes": "^3.1.2",
"chalk": "^5.6.2",
15 changes: 14 additions & 1 deletion src/bindings/AddonTypes.ts
@@ -34,6 +34,9 @@ export type BindingModule = {
swaFullCache?: boolean
}): AddonContext
},
AddonContextSequenceCheckpoint: {
new (): AddonContextSequenceCheckpoint
},
AddonGrammar: {
new (grammarPath: string, params?: {
addonExports?: BindingModule,
@@ -159,7 +162,17 @@ export type AddonContext = {
ensureDraftContextIsCompatibleForSpeculative(draftContext: AddonContext): void,
saveSequenceStateToFile(filePath: string, sequenceId: number, tokens: Uint32Array): Promise<number>,
loadSequenceStateFromFile(filePath: string, sequenceId: number, maxContextSize: number): Promise<Uint32Array>,
setLoras(loras: AddonModelLora[], scales: number[]): void
setLoras(loras: AddonModelLora[], scales: number[]): void,

restoreCheckpoint(checkpoint: AddonContextSequenceCheckpoint): Promise<boolean>
};

export type AddonContextSequenceCheckpoint = {
init(context: AddonContext, sequenceId: number): Promise<void>,
dispose(): void,

get size(): number,
get index(): number
};

export type BatchLogitIndex = number & {
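For orientation, here is a minimal usage sketch of the new checkpoint surface, based only on the types declared above (`AddonContextSequenceCheckpoint`, `restoreCheckpoint`); the `bindings` and `context` values are assumed to come from the existing binding bootstrap and are not part of this diff.

```ts
import type {BindingModule, AddonContext} from "./AddonTypes.js";

// Sketch only — assumes `bindings` is a loaded BindingModule and `context` an existing AddonContext
async function snapshotAndRestore(bindings: BindingModule, context: AddonContext, sequenceId: number) {
    // capture the sequence's current state into a checkpoint
    const checkpoint = new bindings.AddonContextSequenceCheckpoint();
    await checkpoint.init(context, sequenceId);
    console.log(`checkpoint: ${checkpoint.size} bytes, up to token index ${checkpoint.index}`);

    // ... evaluate further tokens on the same sequence ...

    // roll the sequence back; resolves to false if the restored byte count doesn't match the checkpoint size
    const restored = await context.restoreCheckpoint(checkpoint);
    if (!restored)
        throw new Error("checkpoint restoration failed");

    checkpoint.dispose(); // free the checkpoint's buffer when it's no longer needed
}
```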
31 changes: 28 additions & 3 deletions src/bindings/Llama.ts
@@ -10,6 +10,7 @@ import {LlamaJsonSchemaGrammar} from "../evaluator/LlamaJsonSchemaGrammar.js";
import {LlamaGrammar, LlamaGrammarOptions} from "../evaluator/LlamaGrammar.js";
import {ThreadsSplitter} from "../utils/ThreadsSplitter.js";
import {getLlamaClasses, LlamaClasses} from "../utils/getLlamaClasses.js";
import {getTempDir, FsPathHandle} from "../utils/getTempDir.js";
import {BindingModule} from "./AddonTypes.js";
import {
BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel,
@@ -46,6 +47,8 @@ export class Llama {
/** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator;
/** @internal */ public readonly _debug: boolean;
/** @internal */ public readonly _threadsSplitter: ThreadsSplitter;
/** @internal */ public readonly _tempDir?: FsPathHandle;
/** @internal */ private _tempDirNextId: number = 0;
/** @internal */ public _hadErrorLogs: boolean = false;
/** @internal */ private readonly _gpu: LlamaGpuType;
/** @internal */ private readonly _numa: LlamaNuma;
@@ -74,7 +77,7 @@ export class Llama {
public readonly onDispose = new EventRelay<void>();

private constructor({
bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu,
bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, tempDir, numa, buildGpu,
maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit
}: {
bindings: BindingModule,
@@ -89,6 +92,7 @@
release: string
},
debug: boolean,
tempDir?: FsPathHandle,
numa?: LlamaNuma,
buildGpu: BuildGpu,
maxThreads?: number,
@@ -104,6 +108,7 @@

this._bindings = bindings;
this._debug = debug;
this._tempDir = tempDir;
this._numa = numa ?? false;
this._logLevel = this._debug
? LlamaLogLevel.debug
@@ -175,6 +180,7 @@
this.onDispose.dispatchEvent();
await this._backendDisposeGuard.acquireDisposeLock();
await this._bindings.dispose();
await this._tempDir?.dispose();

process.off("beforeExit", this._onBeforeExit);
unregisterDisposeBeforeExit(this._selfWeakRef);
@@ -413,6 +419,15 @@
this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n");
}

/** @internal */
public _createTempFilePath() {
if (this._tempDir == null)
return undefined;

const fileId = this._tempDirNextId++;
return new FsPathHandle(path.join(this._tempDir.path, fileId + ".nlc"));
}

/** @internal */
private _onAddonLog(level: number, message: string) {
const llamaLogLevel = addonLogLevelToLlamaLogLevel.get(level) ?? LlamaLogLevel.fatal;
@@ -507,7 +522,7 @@
/** @internal */
public static async _create({
bindings, bindingPath, extBackendsPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads,
skipLlamaInit = false, debug, numa
skipLlamaInit = false, debug, numa, tempDir
}: {
bindings: BindingModule,
bindingPath: string,
@@ -521,7 +536,8 @@
ramPadding: number | ((totalRam: number) => number),
skipLlamaInit?: boolean,
debug: boolean,
numa?: LlamaNuma
numa?: LlamaNuma,
tempDir?: string | string[] | false
}) {
const vramOrchestrator = new MemoryOrchestrator(() => {
const {total, used, unifiedSize} = bindings.getGpuVramInfo();
@@ -566,6 +582,14 @@
else
resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding);

const resolvedTempDir = tempDir === false
? undefined
: await getTempDir(
typeof tempDir === "string"
? [tempDir]
: tempDir
);

const llama = new Llama({
bindings,
bindingPath,
Expand All @@ -579,6 +603,7 @@ export class Llama {
logLevel,
logger,
debug,
tempDir: resolvedTempDir,
numa,
buildGpu: buildMetadata.buildOptions.gpu,
vramOrchestrator,
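As a reference for the new `tempDir` plumbing, here is a short sketch that mirrors the resolution and file-naming logic in the hunks above rather than adding any behavior; `FsPathHandle`'s cleanup semantics are assumed from the `dispose()` call shown earlier.

```ts
import path from "path";
import {getTempDir, FsPathHandle} from "../utils/getTempDir.js";

// Sketch only — mirrors the tempDir resolution performed in Llama._create above
async function resolveTempDir(tempDir?: string | string[] | false): Promise<FsPathHandle | undefined> {
    if (tempDir === false)
        return undefined; // explicitly disabled: no on-disk cache dir is created

    // a single path is normalized into a one-element candidate list; undefined picks the default
    return await getTempDir(typeof tempDir === "string" ? [tempDir] : tempDir);
}

// numbered ".nlc" files are handed out inside the resolved temp dir (see _createTempFilePath above)
function createTempFilePath(tempDir: FsPathHandle, nextId: number): FsPathHandle {
    return new FsPathHandle(path.join(tempDir.path, nextId + ".nlc"));
}
```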