Skip to content

Commit a0a46d5

Browse files
committed
fix: cast KV cache types to `ggml_type`, filter numeric `GgmlType` enum keys in CLI choices, rename KV cache type options to `experimental*`
1 parent 30d5e2d commit a0a46d5

9 files changed

Lines changed: 108 additions & 62 deletions

File tree

llama/addon/AddonContext.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,14 +446,14 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
446446
if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
447447
auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
448448
if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
449-
context_params.type_k = keyType;
449+
context_params.type_k = static_cast<ggml_type>(keyType);
450450
}
451451
}
452452

453453
if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
454454
auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
455455
if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
456-
context_params.type_v = valueType;
456+
context_params.type_v = static_cast<ggml_type>(valueType);
457457
}
458458
}
459459

src/cli/commands/ChatCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -180,18 +180,20 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
180180
type: "string",
181181
choices: [
182182
"currentQuant",
183-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
183+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
184184
] as const,
185-
description: "The type of the key for the context KV cache tensors"
185+
default: "F16" as const,
186+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
186187
})
187188
.option("kvCacheValueType", {
188189
alias: "kvcvt",
189190
type: "string",
190191
choices: [
191192
"currentQuant",
192-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
193+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
193194
] as const,
194-
description: "The type of the value for the context KV cache tensors"
195+
default: "F16" as const,
196+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
195197
})
196198
.option("swaFullCache", {
197199
alias: "noSwa",
@@ -520,8 +522,8 @@ async function RunChat({
520522
? {fitContext: {contextSize}}
521523
: undefined,
522524
defaultContextFlashAttention: flashAttention,
523-
defaultContextKvCacheKeyType: kvCacheKeyType,
524-
defaultContextKvCacheValueType: kvCacheValueType,
525+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
526+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
525527
defaultContextSwaFullCache: swaFullCache,
526528
useMmap,
527529
useDirectIo,
@@ -557,8 +559,8 @@ async function RunChat({
557559
return await llama.loadModel({
558560
modelPath: resolvedDraftModelPath,
559561
defaultContextFlashAttention: flashAttention,
560-
defaultContextKvCacheKeyType: kvCacheKeyType,
561-
defaultContextKvCacheValueType: kvCacheValueType,
562+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
563+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
562564
defaultContextSwaFullCache: swaFullCache,
563565
useMmap,
564566
useDirectIo,

src/cli/commands/CompleteCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,18 +137,20 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
137137
type: "string",
138138
choices: [
139139
"currentQuant",
140-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
140+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
141141
] as const,
142-
description: "The type of the key for the context KV cache tensors"
142+
default: "F16" as const,
143+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
143144
})
144145
.option("kvCacheValueType", {
145146
alias: "kvcvt",
146147
type: "string",
147148
choices: [
148149
"currentQuant",
149-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
150+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
150151
] as const,
151-
description: "The type of the value for the context KV cache tensors"
152+
default: "F16" as const,
153+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
152154
})
153155
.option("swaFullCache", {
154156
alias: "noSwa",
@@ -427,8 +429,8 @@ async function RunCompletion({
427429
? {fitContext: {contextSize}}
428430
: undefined,
429431
defaultContextFlashAttention: flashAttention,
430-
defaultContextKvCacheKeyType: kvCacheKeyType,
431-
defaultContextKvCacheValueType: kvCacheValueType,
432+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
433+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
432434
defaultContextSwaFullCache: swaFullCache,
433435
useMmap,
434436
useDirectIo,
@@ -464,8 +466,8 @@ async function RunCompletion({
464466
return await llama.loadModel({
465467
modelPath: resolvedDraftModelPath,
466468
defaultContextFlashAttention: flashAttention,
467-
defaultContextKvCacheKeyType: kvCacheKeyType,
468-
defaultContextKvCacheValueType: kvCacheValueType,
469+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
470+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
469471
defaultContextSwaFullCache: swaFullCache,
470472
useMmap,
471473
useDirectIo,

src/cli/commands/InfillCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -147,18 +147,20 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
147147
type: "string",
148148
choices: [
149149
"currentQuant",
150-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
150+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
151151
] as const,
152-
description: "The type of the key for the context KV cache tensors"
152+
default: "F16" as const,
153+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
153154
})
154155
.option("kvCacheValueType", {
155156
alias: "kvcvt",
156157
type: "string",
157158
choices: [
158159
"currentQuant",
159-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
160+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
160161
] as const,
161-
description: "The type of the value for the context KV cache tensors"
162+
default: "F16" as const,
163+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
162164
})
163165
.option("swaFullCache", {
164166
alias: "noSwa",
@@ -450,8 +452,8 @@ async function RunInfill({
450452
? {fitContext: {contextSize}}
451453
: undefined,
452454
defaultContextFlashAttention: flashAttention,
453-
defaultContextKvCacheKeyType: kvCacheKeyType,
454-
defaultContextKvCacheValueType: kvCacheValueType,
455+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
456+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
455457
defaultContextSwaFullCache: swaFullCache,
456458
useMmap,
457459
useDirectIo,
@@ -487,8 +489,8 @@ async function RunInfill({
487489
return await llama.loadModel({
488490
modelPath: resolvedDraftModelPath,
489491
defaultContextFlashAttention: flashAttention,
490-
defaultContextKvCacheKeyType: kvCacheKeyType,
491-
defaultContextKvCacheValueType: kvCacheValueType,
492+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
493+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
492494
defaultContextSwaFullCache: swaFullCache,
493495
useMmap,
494496
useDirectIo,

src/cli/commands/inspect/commands/InspectEstimateCommand.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,18 +125,20 @@ export const InspectEstimateCommand: CommandModule<object, InspectEstimateComman
125125
type: "string",
126126
choices: [
127127
"currentQuant",
128-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
128+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
129129
] as const,
130-
description: "The type of the key for the context KV cache tensors"
130+
default: "F16" as const,
131+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
131132
})
132133
.option("kvCacheValueType", {
133134
alias: "kvcvt",
134135
type: "string",
135136
choices: [
136137
"currentQuant",
137-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
138+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
138139
] as const,
139-
description: "The type of the value for the context KV cache tensors"
140+
default: "F16" as const,
141+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
140142
})
141143
.option("swaFullCache", {
142144
alias: "noSwa",

src/cli/commands/inspect/commands/InspectMeasureCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,20 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
117117
type: "string",
118118
choices: [
119119
"currentQuant",
120-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
120+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
121121
] as const,
122-
description: "The type of the key for the context KV cache tensors"
122+
default: "F16" as const,
123+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
123124
})
124125
.option("kvCacheValueType", {
125126
alias: "kvcvt",
126127
type: "string",
127128
choices: [
128129
"currentQuant",
129-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
130+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
130131
] as const,
131-
description: "The type of the value for the context KV cache tensors"
132+
default: "F16" as const,
133+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
132134
})
133135
.option("swaFullCache", {
134136
alias: "noSwa",
@@ -833,8 +835,8 @@ async function runTestWorkerLogic() {
833835
),
834836
ignoreMemorySafetyChecks: currentContextSizeCheck != null,
835837
flashAttention,
836-
kvCacheKeyType,
837-
kvCacheValueType,
838+
experimentalKvCacheKeyType: kvCacheKeyType,
839+
experimentalKvCacheValueType: kvCacheValueType,
838840
swaFullCache,
839841
batchSize,
840842
failedCreationRemedy: false
@@ -907,8 +909,8 @@ async function runTestWorkerLogic() {
907909
useDirectIo,
908910
gpuLayers,
909911
defaultContextFlashAttention: flashAttention,
910-
defaultContextKvCacheKeyType: kvCacheKeyType,
911-
defaultContextKvCacheValueType: kvCacheValueType,
912+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
913+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
912914
defaultContextSwaFullCache: swaFullCache,
913915
ignoreMemorySafetyChecks: true
914916
});

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -113,17 +113,17 @@ export class LlamaContext {
113113
} = {},
114114
swaFullCache = _model.defaultContextSwaFullCache,
115115
performanceTracking = false,
116-
kvCacheKeyType,
117-
kvCacheValueType,
116+
experimentalKvCacheKeyType,
117+
experimentalKvCacheValueType,
118118
_embeddings,
119119
_ranking
120120
}: LlamaContextOptions & {
121121
sequences: number,
122122
contextSize: number,
123123
batchSize: number,
124124
flashAttention: boolean,
125-
kvCacheKeyType: GgmlType,
126-
kvCacheValueType: GgmlType
125+
experimentalKvCacheKeyType: GgmlType,
126+
experimentalKvCacheValueType: GgmlType
127127
}) {
128128
if (_model.disposed)
129129
throw new DisposedError();
@@ -152,8 +152,8 @@ export class LlamaContext {
152152
: this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1)
153153
);
154154
this._performanceTracking = !!performanceTracking;
155-
this._kvCacheKeyType = kvCacheKeyType;
156-
this._kvCacheValueType = kvCacheValueType;
155+
this._kvCacheKeyType = experimentalKvCacheKeyType;
156+
this._kvCacheValueType = experimentalKvCacheValueType;
157157
this._swaFullCache = !!swaFullCache;
158158
this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
159159
contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
@@ -891,12 +891,12 @@ export class LlamaContext {
891891
const flashAttention = _model.flashAttentionSupported
892892
? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention)
893893
: false;
894-
const kvCacheKeyType = options.kvCacheKeyType === "currentQuant"
894+
const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant"
895895
? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType
896-
: resolveGgmlTypeOption(options.kvCacheKeyType) ?? _model.defaultContextKvCacheKeyType;
897-
const kvCacheValueType = options.kvCacheValueType === "currentQuant"
896+
: resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType;
897+
const kvCacheValueType = options.experimentalKvCacheValueType === "currentQuant"
898898
? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType
899-
: resolveGgmlTypeOption(options.kvCacheValueType) ?? _model.defaultContextKvCacheValueType;
899+
: resolveGgmlTypeOption(options.experimentalKvCacheValueType) ?? _model.defaultContextKvCacheValueType;
900900
const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache;
901901
const loraOptions = typeof options.lora === "string"
902902
? {adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"]
@@ -953,7 +953,14 @@ export class LlamaContext {
953953
});
954954

955955
const context = new LlamaContext({_model}, {
956-
...options, contextSize, batchSize, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache
956+
...options,
957+
contextSize,
958+
batchSize,
959+
sequences,
960+
flashAttention,
961+
experimentalKvCacheKeyType: kvCacheKeyType,
962+
experimentalKvCacheValueType: kvCacheValueType,
963+
swaFullCache
957964
});
958965
const contextCreationVramReservation = options.ignoreMemorySafetyChecks
959966
? null

src/evaluator/LlamaContext/types.ts

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,31 @@ export type LlamaContextOptions = {
111111
* Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors.
112112
*
113113
* Defaults to `F16` (inherited from the model option `defaultContextKvCacheKeyType`).
114-
* @experimental - this option is experimental. it may not work as intended, and may change in the future
114+
* @deprecated - this option is experimental and highly unstable.
115+
* Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly.
116+
* Avoid allowing end users to configure this option, as it's highly unstable.
117+
* @experimental - this option is experimental and highly unstable.
118+
* It may not work as intended or even crash the process.
119+
* Use with caution.
120+
* This option may change or get removed in the future without a breaking change version.
115121
*/
116-
kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
122+
experimentalKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
117123

118124
/**
119125
* The type of the value for the KV cache tensors used in this context.
120126
*
121127
* Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors.
122128
*
123129
* Defaults to `F16` (inherited from the model option `defaultContextKvCacheValueType`).
124-
* @experimental - this option is experimental. it may not work as intended, and may change in the future
130+
* @deprecated - this option is experimental and highly unstable.
131+
* Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly.
132+
* Avoid allowing end users to configure this option, as it's highly unstable.
133+
* @experimental - this option is experimental and highly unstable.
134+
* It may not work as intended or even crash the process.
135+
* Use with caution.
136+
* This option may change or get removed in the future without a breaking change version.
125137
*/
126-
kvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
138+
experimentalKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
127139

128140
/**
129141
* When using SWA (Sliding Window Attention) on a supported model,

0 commit comments

Comments (0)