Skip to content

Commit c641959

Browse files
authored
feat: automatic checkpoints for models that need it (#573)

* feat: automatic checkpoints for models that need it (such as Qwen 3.5 due to its hybrid architecture)
* feat(`QwenChatWrapper`): Qwen 3.5 support
* feat(`inspect gpu` command): detect and report missing prebuilt binary modules and custom npm registry
* feat: initial disk cache dir option for future optimizations (disabled for now)
* fix: Qwen 3.5 memory estimation
* fix: grammar use with HarmonyChatWrapper
* fix: add mistral think segment detection
* fix: compress excessively long segments from the current response on context shift instead of throwing an error
* fix: default thinking budget to 75% of the context size to prevent low-quality responses
* fix: bugs
1 parent cc105b9 commit c641959

35 files changed

Lines changed: 5999 additions & 3899 deletions

docs/guide/awesome.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue";
1212
* [CatAI](https://github.com/withcatai/catai) - a simplified AI assistant API for Node.js, with REST API support
1313
<br /><DataBadge title="License" content="MIT"/>
1414

15-
* [Manzoni](https://manzoni.app/) ([GitHub](https://github.com/gems-platforms/manzoni-app)) - a text editor running local LLMs
16-
<br /><DataBadge title="License" content="AGPL-3.0"/>
15+
* [QMD](https://github.com/tobi/qmd) (Query Markup Documents) - an on-device search engine for your markdown notes, meeting transcripts, documentation, and knowledge bases. Search with keywords or natural language
16+
<br /><DataBadge title="License" content="MIT"/>
1717

1818
* [Clippy](https://felixrieseberg.github.io/clippy/) ([GitHub](https://github.com/felixrieseberg/clippy)) - Clippy, resurrected from the 1990s, now with some AI
1919
<br /><DataBadge title="License" content="MIT"/>
@@ -25,6 +25,8 @@ import DataBadge from "../../.vitepress/components/DataBadge/DataBadge.vue";
2525

2626
* [nutshell](https://withnutshell.com) - Private AI meeting notes processed completely on your device
2727

28+
* [Manzoni](https://manzoni.app/) ([GitHub](https://github.com/gems-platforms/manzoni-app)) - a text editor running local LLMs
29+
2830

2931

3032
<br />

llama/addon/AddonContext.cpp

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,69 @@ Napi::Value AddonContext::SetLoras(const Napi::CallbackInfo& info) {
965965
return info.Env().Undefined();
966966
}
967967

968+
// Async worker that applies a previously captured sequence checkpoint back onto
// a context off the JavaScript thread.
// Resolves its promise with `true` when the state was fully restored and the
// sequence was trimmed down to `maxPosIndex`, `false` otherwise.
class RestoreCheckpointWorker : public Napi::AsyncWorker {
    public:
        AddonContext* context; // context to restore into (kept alive via Ref while the worker runs)
        AddonContextSequenceCheckpoint* checkpoint; // checkpoint being restored (kept alive via Ref)
        std::size_t maxPosIndex; // highest token position that should remain in the sequence after the restore
        bool restoreSuccess = false;

        RestoreCheckpointWorker(const Napi::CallbackInfo& info, AddonContext* context, AddonContextSequenceCheckpoint* checkpoint, std::size_t maxPosIndex)
            : Napi::AsyncWorker(info.Env(), "RestoreCheckpointWorker"),
              context(context),
              checkpoint(checkpoint),
              maxPosIndex(maxPosIndex),
              deferred(Napi::Promise::Deferred::New(info.Env())) {
            // keep both JS wrappers alive for the duration of the background work
            context->Ref();
            checkpoint->Ref();
        }
        ~RestoreCheckpointWorker() {
            context->Unref();
            checkpoint->Unref();
        }

        Napi::Promise GetPromise() {
            return deferred.Promise();
        }

    protected:
        Napi::Promise::Deferred deferred;

        // Runs on a worker thread (no JS/N-API calls allowed here).
        void Execute() {
            try {
                // guard `checkpoint->data` against a concurrent `dispose()` on the JS thread
                std::lock_guard<std::mutex> lock(checkpoint->dataMutex);

                std::size_t dataSize = checkpoint->data.size();
                std::size_t restoreSize = llama_state_seq_set_data_ext(context->ctx, checkpoint->data.data(), dataSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                if (restoreSize == dataSize) {
                    // remove any tokens past the checkpoint ([maxPosIndex + 1, end)) and
                    // verify the sequence now ends exactly at maxPosIndex
                    // NOTE(review): `llama_memory_seq_pos_max` returns a signed `llama_pos`,
                    // compared here against an unsigned `std::size_t` — confirm the sequence
                    // can never be empty (-1) at this point
                    restoreSuccess = (
                        llama_memory_seq_rm(llama_get_memory(context->ctx), checkpoint->sequenceId, maxPosIndex + 1, -1) &&
                        llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId) == maxPosIndex
                    );
                }
                // restoreSize != dataSize means the state could not be applied;
                // restoreSuccess stays false and the promise resolves to false
            } catch (const std::exception& e) {
                SetError(e.what());
            } catch(...) {
                SetError("Unknown error when calling \"llama_state_seq_set_data_ext\"");
            }
        }
        void OnOK() {
            deferred.Resolve(Napi::Boolean::New(Env(), restoreSuccess));
        }
        void OnError(const Napi::Error& err) {
            deferred.Reject(err.Value());
        }
};
1021+
1022+
// Queues an async restore of the checkpoint passed as argument 0 onto this
// context, trimming the sequence down to the max position index (argument 1).
// Returns a promise that resolves to whether the restore fully succeeded.
Napi::Value AddonContext::RestoreCheckpoint(const Napi::CallbackInfo& info) {
    auto* const targetCheckpoint = Napi::ObjectWrap<AddonContextSequenceCheckpoint>::Unwrap(info[0].As<Napi::Object>());
    const std::size_t maxPosIndex = info[1].As<Napi::Number>().Int32Value();

    // the worker deletes itself after OnOK/OnError (standard Napi::AsyncWorker lifecycle)
    auto* const restoreWorker = new RestoreCheckpointWorker(info, this, targetCheckpoint, maxPosIndex);
    restoreWorker->Queue();
    return restoreWorker->GetPromise();
}
1030+
9681031
void AddonContext::init(Napi::Object exports) {
9691032
exports.Set(
9701033
"AddonContext",
@@ -992,8 +1055,113 @@ void AddonContext::init(Napi::Object exports) {
9921055
InstanceMethod("saveSequenceStateToFile", &AddonContext::SaveSequenceStateToFile),
9931056
InstanceMethod("loadSequenceStateFromFile", &AddonContext::LoadSequenceStateFromFile),
9941057
InstanceMethod("setLoras", &AddonContext::SetLoras),
1058+
InstanceMethod("restoreCheckpoint", &AddonContext::RestoreCheckpoint),
9951059
InstanceMethod("dispose", &AddonContext::Dispose),
9961060
}
9971061
)
9981062
);
9991063
}
1064+
1065+
// JS-constructible wrapper for a point-in-time copy of a sequence's state.
// Construction is trivial; the actual snapshot is captured later via `Init`
// (exposed to JS as the `init` instance method).
AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonContextSequenceCheckpoint>(info) {

}
AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() {
    // release the captured state data when the JS object is garbage collected
    dispose();
}
1071+
1072+
class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker {
1073+
public:
1074+
AddonContextSequenceCheckpoint* checkpoint;
1075+
AddonContext* context;
1076+
1077+
AddonContextSequenceCheckpointInitWorker(const Napi::CallbackInfo& info, AddonContextSequenceCheckpoint* checkpoint, AddonContext* context)
1078+
: Napi::AsyncWorker(info.Env(), "AddonContextSequenceCheckpointInitWorker"),
1079+
checkpoint(checkpoint),
1080+
context(context),
1081+
deferred(Napi::Promise::Deferred::New(info.Env())) {
1082+
checkpoint->Ref();
1083+
context->Ref();
1084+
}
1085+
~AddonContextSequenceCheckpointInitWorker() {
1086+
checkpoint->Unref();
1087+
context->Unref();
1088+
}
1089+
1090+
Napi::Promise GetPromise() {
1091+
return deferred.Promise();
1092+
}
1093+
1094+
protected:
1095+
Napi::Promise::Deferred deferred;
1096+
1097+
void Execute() {
1098+
try {
1099+
checkpoint->minPos = llama_memory_seq_pos_min(llama_get_memory(context->ctx), checkpoint->sequenceId);
1100+
checkpoint->maxPos = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId);
1101+
const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
1102+
1103+
checkpoint->data.resize(checkpointSize, 0);
1104+
llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
1105+
} catch (const std::exception& e) {
1106+
SetError(e.what());
1107+
} catch(...) {
1108+
SetError("Unknown error when calling \"llama_state_seq_get_data_ext\"");
1109+
}
1110+
}
1111+
void OnOK() {
1112+
deferred.Resolve(Env().Undefined());
1113+
}
1114+
void OnError(const Napi::Error& err) {
1115+
deferred.Reject(err.Value());
1116+
}
1117+
};
1118+
1119+
// Starts capturing the state of a sequence (argument 1) from the given context
// (argument 0) into this checkpoint.
// Returns a promise that resolves once the snapshot has been taken.
Napi::Value AddonContextSequenceCheckpoint::Init(const Napi::CallbackInfo& info) {
    auto* const sourceContext = Napi::ObjectWrap<AddonContext>::Unwrap(info[0].As<Napi::Object>());
    sequenceId = info[1].As<Napi::Number>().Int32Value();

    // the worker deletes itself after OnOK/OnError (standard Napi::AsyncWorker lifecycle)
    auto* const initWorker = new AddonContextSequenceCheckpointInitWorker(info, this, sourceContext);
    initWorker->Queue();
    return initWorker->GetPromise();
}
1127+
1128+
// JS-facing `dispose()` method — releases the captured state data.
Napi::Value AddonContextSequenceCheckpoint::Dispose(const Napi::CallbackInfo& info) {
    const auto env = info.Env();
    dispose();
    return env.Undefined();
}
1132+
1133+
void AddonContextSequenceCheckpoint::dispose() {
1134+
std::lock_guard<std::mutex> lock(dataMutex);
1135+
data.clear();
1136+
data.resize(0);
1137+
}
1138+
1139+
// `size` accessor — the byte size of the captured state data.
Napi::Value AddonContextSequenceCheckpoint::GetSize(const Napi::CallbackInfo& info) {
    // take `dataMutex` so the size is not read while a worker thread or
    // `dispose()` is mutating `data` under the same lock
    std::lock_guard<std::mutex> lock(dataMutex);
    return Napi::Number::New(info.Env(), data.size());
}
1142+
1143+
// `minPos` accessor — the lowest token position captured in the checkpoint
// (set from `llama_memory_seq_pos_min` by the init worker).
// NOTE(review): `llama_memory_seq_pos_min` yields -1 for an empty sequence —
// confirm callers never read this before a successful `init`.
Napi::Value AddonContextSequenceCheckpoint::GetMinPos(const Napi::CallbackInfo& info) {
    return Napi::Number::New(info.Env(), minPos);
}
1146+
1147+
// `maxPos` accessor — the highest token position captured in the checkpoint
// (set from `llama_memory_seq_pos_max` by the init worker).
// NOTE(review): `llama_memory_seq_pos_max` yields -1 for an empty sequence —
// confirm callers never read this before a successful `init`.
Napi::Value AddonContextSequenceCheckpoint::GetMaxPos(const Napi::CallbackInfo& info) {
    return Napi::Number::New(info.Env(), maxPos);
}
1150+
1151+
// Registers the AddonContextSequenceCheckpoint class on the module exports so
// it can be constructed and used from JavaScript.
void AddonContextSequenceCheckpoint::init(Napi::Object exports) {
    exports.Set(
        "AddonContextSequenceCheckpoint",
        DefineClass(
            exports.Env(),
            "AddonContextSequenceCheckpoint",
            {
                InstanceMethod("init", &AddonContextSequenceCheckpoint::Init),
                InstanceMethod("dispose", &AddonContextSequenceCheckpoint::Dispose),

                // read-only accessors (no setters)
                InstanceAccessor("size", &AddonContextSequenceCheckpoint::GetSize, nullptr),
                InstanceAccessor("minPos", &AddonContextSequenceCheckpoint::GetMinPos, nullptr),
                InstanceAccessor("maxPos", &AddonContextSequenceCheckpoint::GetMaxPos, nullptr),
            }
        )
    );
}

llama/addon/AddonContext.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#pragma once
2+
3+
#include <mutex>
4+
25
#include "llama.h"
36
#include "napi.h"
47
#include "addonGlobals.h"
@@ -53,6 +56,30 @@ class AddonContext : public Napi::ObjectWrap<AddonContext> {
5356
Napi::Value EnsureDraftContextIsCompatibleForSpeculative(const Napi::CallbackInfo& info);
5457

5558
Napi::Value SetLoras(const Napi::CallbackInfo& info);
59+
Napi::Value RestoreCheckpoint(const Napi::CallbackInfo& info);
60+
61+
static void init(Napi::Object exports);
62+
};
63+
64+
// A point-in-time copy of a context sequence's state, exposed to JS so a
// sequence can later be restored to this exact state.
class AddonContextSequenceCheckpoint : public Napi::ObjectWrap<AddonContextSequenceCheckpoint> {
    public:
        // guards `data` against concurrent access from async workers and `dispose()`
        std::mutex dataMutex;
        // the serialized sequence state, filled by the init async worker
        std::vector<uint8_t> data;
        llama_seq_id sequenceId = 0;
        // token position range captured by this checkpoint.
        // `llama_pos` (signed) rather than `std::size_t`: `llama_memory_seq_pos_min`/
        // `llama_memory_seq_pos_max` return -1 for an empty sequence, which would wrap
        // to SIZE_MAX in an unsigned field and surface to JS as a huge number
        llama_pos minPos = 0;
        llama_pos maxPos = 0;

        AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info);
        ~AddonContextSequenceCheckpoint();

        Napi::Value Init(const Napi::CallbackInfo& info);
        Napi::Value Dispose(const Napi::CallbackInfo& info);

        // frees the captured state data (thread-safe via dataMutex)
        void dispose();

        Napi::Value GetSize(const Napi::CallbackInfo& info);
        Napi::Value GetMinPos(const Napi::CallbackInfo& info);
        Napi::Value GetMaxPos(const Napi::CallbackInfo& info);

        static void init(Napi::Object exports);
};

llama/addon/addon.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
317317
AddonGrammar::init(exports);
318318
AddonGrammarEvaluationState::init(exports);
319319
AddonContext::init(exports);
320+
AddonContextSequenceCheckpoint::init(exports);
320321
AddonSampler::init(exports);
321322

322323
llama_log_set(addonLlamaCppLogCallback, nullptr);

0 commit comments

Comments
 (0)