Skip to content

Commit a0a46d5

Browse files
committed
fix: cast KV cache types to `ggml_type`, filter numeric `GgmlType` enum keys in CLI choices, rename KV cache type options to `experimental*`
1 parent 30d5e2d commit a0a46d5

9 files changed

Lines changed: 108 additions & 62 deletions

File tree

llama/addon/AddonContext.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,14 +446,14 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
446446
if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
447447
auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
448448
if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
449-
context_params.type_k = keyType;
449+
context_params.type_k = static_cast<ggml_type>(keyType);
450450
}
451451
}
452452

453453
if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
454454
auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
455455
if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
456-
context_params.type_v = valueType;
456+
context_params.type_v = static_cast<ggml_type>(valueType);
457457
}
458458
}
459459

src/cli/commands/ChatCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -180,18 +180,20 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
180180
type: "string",
181181
choices: [
182182
"currentQuant",
183-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
183+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
184184
] as const,
185-
description: "The type of the key for the context KV cache tensors"
185+
default: "F16" as const,
186+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
186187
})
187188
.option("kvCacheValueType", {
188189
alias: "kvcvt",
189190
type: "string",
190191
choices: [
191192
"currentQuant",
192-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
193+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
193194
] as const,
194-
description: "The type of the value for the context KV cache tensors"
195+
default: "F16" as const,
196+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
195197
})
196198
.option("swaFullCache", {
197199
alias: "noSwa",
@@ -520,8 +522,8 @@ async function RunChat({
520522
? {fitContext: {contextSize}}
521523
: undefined,
522524
defaultContextFlashAttention: flashAttention,
523-
defaultContextKvCacheKeyType: kvCacheKeyType,
524-
defaultContextKvCacheValueType: kvCacheValueType,
525+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
526+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
525527
defaultContextSwaFullCache: swaFullCache,
526528
useMmap,
527529
useDirectIo,
@@ -557,8 +559,8 @@ async function RunChat({
557559
return await llama.loadModel({
558560
modelPath: resolvedDraftModelPath,
559561
defaultContextFlashAttention: flashAttention,
560-
defaultContextKvCacheKeyType: kvCacheKeyType,
561-
defaultContextKvCacheValueType: kvCacheValueType,
562+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
563+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
562564
defaultContextSwaFullCache: swaFullCache,
563565
useMmap,
564566
useDirectIo,

src/cli/commands/CompleteCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -137,18 +137,20 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
137137
type: "string",
138138
choices: [
139139
"currentQuant",
140-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
140+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
141141
] as const,
142-
description: "The type of the key for the context KV cache tensors"
142+
default: "F16" as const,
143+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
143144
})
144145
.option("kvCacheValueType", {
145146
alias: "kvcvt",
146147
type: "string",
147148
choices: [
148149
"currentQuant",
149-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
150+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
150151
] as const,
151-
description: "The type of the value for the context KV cache tensors"
152+
default: "F16" as const,
153+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
152154
})
153155
.option("swaFullCache", {
154156
alias: "noSwa",
@@ -427,8 +429,8 @@ async function RunCompletion({
427429
? {fitContext: {contextSize}}
428430
: undefined,
429431
defaultContextFlashAttention: flashAttention,
430-
defaultContextKvCacheKeyType: kvCacheKeyType,
431-
defaultContextKvCacheValueType: kvCacheValueType,
432+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
433+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
432434
defaultContextSwaFullCache: swaFullCache,
433435
useMmap,
434436
useDirectIo,
@@ -464,8 +466,8 @@ async function RunCompletion({
464466
return await llama.loadModel({
465467
modelPath: resolvedDraftModelPath,
466468
defaultContextFlashAttention: flashAttention,
467-
defaultContextKvCacheKeyType: kvCacheKeyType,
468-
defaultContextKvCacheValueType: kvCacheValueType,
469+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
470+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
469471
defaultContextSwaFullCache: swaFullCache,
470472
useMmap,
471473
useDirectIo,

src/cli/commands/InfillCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -147,18 +147,20 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
147147
type: "string",
148148
choices: [
149149
"currentQuant",
150-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
150+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
151151
] as const,
152-
description: "The type of the key for the context KV cache tensors"
152+
default: "F16" as const,
153+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
153154
})
154155
.option("kvCacheValueType", {
155156
alias: "kvcvt",
156157
type: "string",
157158
choices: [
158159
"currentQuant",
159-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
160+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
160161
] as const,
161-
description: "The type of the value for the context KV cache tensors"
162+
default: "F16" as const,
163+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
162164
})
163165
.option("swaFullCache", {
164166
alias: "noSwa",
@@ -450,8 +452,8 @@ async function RunInfill({
450452
? {fitContext: {contextSize}}
451453
: undefined,
452454
defaultContextFlashAttention: flashAttention,
453-
defaultContextKvCacheKeyType: kvCacheKeyType,
454-
defaultContextKvCacheValueType: kvCacheValueType,
455+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
456+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
455457
defaultContextSwaFullCache: swaFullCache,
456458
useMmap,
457459
useDirectIo,
@@ -487,8 +489,8 @@ async function RunInfill({
487489
return await llama.loadModel({
488490
modelPath: resolvedDraftModelPath,
489491
defaultContextFlashAttention: flashAttention,
490-
defaultContextKvCacheKeyType: kvCacheKeyType,
491-
defaultContextKvCacheValueType: kvCacheValueType,
492+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
493+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
492494
defaultContextSwaFullCache: swaFullCache,
493495
useMmap,
494496
useDirectIo,

src/cli/commands/inspect/commands/InspectEstimateCommand.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,18 +125,20 @@ export const InspectEstimateCommand: CommandModule<object, InspectEstimateComman
125125
type: "string",
126126
choices: [
127127
"currentQuant",
128-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
128+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
129129
] as const,
130-
description: "The type of the key for the context KV cache tensors"
130+
default: "F16" as const,
131+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
131132
})
132133
.option("kvCacheValueType", {
133134
alias: "kvcvt",
134135
type: "string",
135136
choices: [
136137
"currentQuant",
137-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
138+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
138139
] as const,
139-
description: "The type of the value for the context KV cache tensors"
140+
default: "F16" as const,
141+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
140142
})
141143
.option("swaFullCache", {
142144
alias: "noSwa",

src/cli/commands/inspect/commands/InspectMeasureCommand.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,20 @@ export const InspectMeasureCommand: CommandModule<object, InspectMeasureCommand>
117117
type: "string",
118118
choices: [
119119
"currentQuant",
120-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
120+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
121121
] as const,
122-
description: "The type of the key for the context KV cache tensors"
122+
default: "F16" as const,
123+
description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
123124
})
124125
.option("kvCacheValueType", {
125126
alias: "kvcvt",
126127
type: "string",
127128
choices: [
128129
"currentQuant",
129-
...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
130+
...Object.keys(GgmlType).filter((key) => !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[]
130131
] as const,
131-
description: "The type of the value for the context KV cache tensors"
132+
default: "F16" as const,
133+
description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors"
132134
})
133135
.option("swaFullCache", {
134136
alias: "noSwa",
@@ -833,8 +835,8 @@ async function runTestWorkerLogic() {
833835
),
834836
ignoreMemorySafetyChecks: currentContextSizeCheck != null,
835837
flashAttention,
836-
kvCacheKeyType,
837-
kvCacheValueType,
838+
experimentalKvCacheKeyType: kvCacheKeyType,
839+
experimentalKvCacheValueType: kvCacheValueType,
838840
swaFullCache,
839841
batchSize,
840842
failedCreationRemedy: false
@@ -907,8 +909,8 @@ async function runTestWorkerLogic() {
907909
useDirectIo,
908910
gpuLayers,
909911
defaultContextFlashAttention: flashAttention,
910-
defaultContextKvCacheKeyType: kvCacheKeyType,
911-
defaultContextKvCacheValueType: kvCacheValueType,
912+
experimentalDefaultContextKvCacheKeyType: kvCacheKeyType,
913+
experimentalDefaultContextKvCacheValueType: kvCacheValueType,
912914
defaultContextSwaFullCache: swaFullCache,
913915
ignoreMemorySafetyChecks: true
914916
});

src/evaluator/LlamaContext/LlamaContext.ts

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -113,17 +113,17 @@ export class LlamaContext {
113113
} = {},
114114
swaFullCache = _model.defaultContextSwaFullCache,
115115
performanceTracking = false,
116-
kvCacheKeyType,
117-
kvCacheValueType,
116+
experimentalKvCacheKeyType,
117+
experimentalKvCacheValueType,
118118
_embeddings,
119119
_ranking
120120
}: LlamaContextOptions & {
121121
sequences: number,
122122
contextSize: number,
123123
batchSize: number,
124124
flashAttention: boolean,
125-
kvCacheKeyType: GgmlType,
126-
kvCacheValueType: GgmlType
125+
experimentalKvCacheKeyType: GgmlType,
126+
experimentalKvCacheValueType: GgmlType
127127
}) {
128128
if (_model.disposed)
129129
throw new DisposedError();
@@ -152,8 +152,8 @@ export class LlamaContext {
152152
: this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1)
153153
);
154154
this._performanceTracking = !!performanceTracking;
155-
this._kvCacheKeyType = kvCacheKeyType;
156-
this._kvCacheValueType = kvCacheValueType;
155+
this._kvCacheKeyType = experimentalKvCacheKeyType;
156+
this._kvCacheValueType = experimentalKvCacheValueType;
157157
this._swaFullCache = !!swaFullCache;
158158
this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
159159
contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
@@ -891,12 +891,12 @@ export class LlamaContext {
891891
const flashAttention = _model.flashAttentionSupported
892892
? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention)
893893
: false;
894-
const kvCacheKeyType = options.kvCacheKeyType === "currentQuant"
894+
const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant"
895895
? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType
896-
: resolveGgmlTypeOption(options.kvCacheKeyType) ?? _model.defaultContextKvCacheKeyType;
897-
const kvCacheValueType = options.kvCacheValueType === "currentQuant"
896+
: resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType;
897+
const kvCacheValueType = options.experimentalKvCacheValueType === "currentQuant"
898898
? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheValueType
899-
: resolveGgmlTypeOption(options.kvCacheValueType) ?? _model.defaultContextKvCacheValueType;
899+
: resolveGgmlTypeOption(options.experimentalKvCacheValueType) ?? _model.defaultContextKvCacheValueType;
900900
const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache;
901901
const loraOptions = typeof options.lora === "string"
902902
? {adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"]
@@ -953,7 +953,14 @@ export class LlamaContext {
953953
});
954954

955955
const context = new LlamaContext({_model}, {
956-
...options, contextSize, batchSize, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache
956+
...options,
957+
contextSize,
958+
batchSize,
959+
sequences,
960+
flashAttention,
961+
experimentalKvCacheKeyType: kvCacheKeyType,
962+
experimentalKvCacheValueType: kvCacheValueType,
963+
swaFullCache
957964
});
958965
const contextCreationVramReservation = options.ignoreMemorySafetyChecks
959966
? null

src/evaluator/LlamaContext/types.ts

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,31 @@ export type LlamaContextOptions = {
111111
* Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors.
112112
*
113113
* Defaults to `F16` (inherited from the model option `defaultContextKvCacheKeyType`).
114-
* @experimental - this option is experimental. it may not work as intended, and may change in the future
114+
* @deprecated - this option is experimental and highly unstable.
115+
* Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly.
116+
* Avoid allowing end users to configure this option, as it's highly unstable.
117+
* @experimental - this option is experimental and highly unstable.
118+
* It may not work as intended or even crash the process.
119+
* Use with caution.
120+
* This option may change or get removed in the future without a breaking change version.
115121
*/
116-
kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
122+
experimentalKvCacheKeyType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
117123

118124
/**
119125
* The type of the value for the KV cache tensors used in this context.
120126
*
121127
* Set to `"currentQuant"` to use the same type as the current quantization of the model weights tensors.
122128
*
123129
* Defaults to `F16` (inherited from the model option `defaultContextKvCacheValueType`).
124-
* @experimental - this option is experimental. it may not work as intended, and may change in the future
130+
* @deprecated - this option is experimental and highly unstable.
131+
* Only use with a hard-coded model and on specific hardware that you verify where the type passed to this option works correctly.
132+
* Avoid allowing end users to configure this option, as it's highly unstable.
133+
* @experimental - this option is experimental and highly unstable.
134+
* It may not work as intended or even crash the process.
135+
* Use with caution.
136+
* This option may change or get removed in the future without a breaking change version.
125137
*/
126-
kvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
138+
experimentalKvCacheValueType?: "currentQuant" | keyof typeof GgmlType | GgmlType,
127139

128140
/**
129141
* When using SWA (Sliding Window Attention) on a supported model,

0 commit comments

Comments (0)