withcatai
diff --git a/‎llama/addon/AddonContext.cpp‎
Lines changed: 17 additions & 3 deletions b/‎llama/addon/AddonContext.cpp‎
Lines changed: 17 additions & 3 deletions
diff --git a/‎src/bindings/AddonTypes.ts‎
Lines changed: 2 additions & 0 deletions b/‎src/bindings/AddonTypes.ts‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/cli/commands/ChatCommand.ts‎
Lines changed: 33 additions & 4 deletions b/‎src/cli/commands/ChatCommand.ts‎
Lines changed: 33 additions & 4 deletions
diff --git a/‎src/cli/commands/CompleteCommand.ts‎
Lines changed: 35 additions & 4 deletions b/‎src/cli/commands/CompleteCommand.ts‎
Lines changed: 35 additions & 4 deletions
@@ -443,6 +443,20 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
             context_params.no_perf = !(options.Get("performanceTracking").As<Napi::Boolean>().Value());
         }
 
+        if (options.Has("kvCacheKeyType") && options.Get("kvCacheKeyType").IsNumber()) {
+            auto keyType = options.Get("kvCacheKeyType").As<Napi::Number>().Int32Value();
+            if (keyType >= 0 && keyType < GGML_TYPE_COUNT) {
+                context_params.type_k = keyType;
+            }
+        }
+
+        if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) {
+            auto valueType = options.Get("kvCacheValueType").As<Napi::Number>().Int32Value();
+            if (valueType >= 0 && valueType < GGML_TYPE_COUNT) {
+                context_params.type_v = valueType;
+            }
+        }
+
         if (options.Has("swaFullCache")) {
             context_params.swa_full = options.Get("swaFullCache").As<Napi::Boolean>().Value();
         }
@@ -1063,7 +1077,7 @@ void AddonContext::init(Napi::Object exports) {
 }
 
 AddonContextSequenceCheckpoint::AddonContextSequenceCheckpoint(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonContextSequenceCheckpoint>(info) {
-    
+
 }
 AddonContextSequenceCheckpoint::~AddonContextSequenceCheckpoint() {
     dispose();
@@ -1099,7 +1113,7 @@ class AddonContextSequenceCheckpointInitWorker : public Napi::AsyncWorker {
                 checkpoint->minPos = llama_memory_seq_pos_min(llama_get_memory(context->ctx), checkpoint->sequenceId);
                 checkpoint->maxPos = llama_memory_seq_pos_max(llama_get_memory(context->ctx), checkpoint->sequenceId);
                 const size_t checkpointSize = llama_state_seq_get_size_ext(context->ctx, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                
+
                 checkpoint->data.resize(checkpointSize, 0);
                 llama_state_seq_get_data_ext(context->ctx, checkpoint->data.data(), checkpointSize, checkpoint->sequenceId, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
             } catch (const std::exception& e) {
@@ -1164,4 +1178,4 @@ void AddonContextSequenceCheckpoint::init(Napi::Object exports) {
             }
         )
     );
-}
+}
@@ -31,6 +31,8 @@ export type BindingModule = {
             ranking?: boolean,
             threads?: number,
             performanceTracking?: boolean,
+            kvCacheKeyType?: number,
+            kvCacheValueType?: number,
             swaFullCache?: boolean
         }): AddonContext
     },
 
@@ -31,6 +31,7 @@ import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescripti
 import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
 import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
 import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
+import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";
 
 type ChatCommand = {
     modelPath?: string,
@@ -46,6 +47,8 @@ type ChatCommand = {
     contextSize?: number,
     batchSize?: number,
     flashAttention?: boolean,
+    kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
+    kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
     swaFullCache?: boolean,
     noTrimWhitespace: boolean,
     grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[1],
@@ -172,6 +175,24 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Enable flash attention"
             })
+            .option("kvCacheKeyType", {
+                alias: "kvckt",
+                type: "string",
+                choices: [
+                    "currentQuant",
+                    ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
+                ] as const,
+                description: "The type of the key for the context KV cache tensors"
+            })
+            .option("kvCacheValueType", {
+                alias: "kvcvt",
+                type: "string",
+                choices: [
+                    "currentQuant",
+                    ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
+                ] as const,
+                description: "The type of the value for the context KV cache tensors"
+            })
             .option("swaFullCache", {
                 alias: "noSwa",
                 type: "boolean",
@@ -379,7 +400,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
     },
     async handler({
         modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt,
-        promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
+        promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache,
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
@@ -390,8 +411,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         try {
             await RunChat({
                 modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
-                batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
-                temperature, minP, topK, topP, seed, xtc,
+                batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile,
+                threads, temperature, minP, topK, topP, seed, xtc,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
                 maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
@@ -408,7 +429,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
 
 async function RunChat({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja,
-    contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
+    contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
@@ -444,12 +465,16 @@ async function RunChat({
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
         swaFullCache,
+        kvCacheKeyType,
+        kvCacheValueType,
         useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
             swaFullCache,
+            kvCacheKeyType,
+            kvCacheValueType,
             useMmap,
             consoleTitle: "Draft model file"
         })
@@ -495,6 +520,8 @@ async function RunChat({
                         ? {fitContext: {contextSize}}
                         : undefined,
                 defaultContextFlashAttention: flashAttention,
+                defaultContextKvCacheKeyType: kvCacheKeyType,
+                defaultContextKvCacheValueType: kvCacheValueType,
                 defaultContextSwaFullCache: swaFullCache,
                 useMmap,
                 useDirectIo,
@@ -530,6 +557,8 @@ async function RunChat({
                 return await llama.loadModel({
                     modelPath: resolvedDraftModelPath,
                     defaultContextFlashAttention: flashAttention,
+                    defaultContextKvCacheKeyType: kvCacheKeyType,
+                    defaultContextKvCacheValueType: kvCacheValueType,
                     defaultContextSwaFullCache: swaFullCache,
                     useMmap,
                     useDirectIo,
 
@@ -23,6 +23,7 @@ import {documentationPageUrls} from "../../config.js";
 import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
 import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
 import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
+import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";
 
 type CompleteCommand = {
     modelPath?: string,
@@ -34,6 +35,8 @@ type CompleteCommand = {
     contextSize?: number,
     batchSize?: number,
     flashAttention?: boolean,
+    kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
+    kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
     swaFullCache?: boolean,
     threads?: number,
     temperature: number,
@@ -129,6 +132,24 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
                 default: false,
                 description: "Enable flash attention"
             })
+            .option("kvCacheKeyType", {
+                alias: "kvckt",
+                type: "string",
+                choices: [
+                    "currentQuant",
+                    ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
+                ] as const,
+                description: "The type of the key for the context KV cache tensors"
+            })
+            .option("kvCacheValueType", {
+                alias: "kvcvt",
+                type: "string",
+                choices: [
+                    "currentQuant",
+                    ...Object.keys(GgmlType).filter((key) => typeof key === "string") as (keyof typeof GgmlType)[]
+                ] as const,
+                description: "The type of the value for the context KV cache tensors"
+            })
             .option("swaFullCache", {
                 alias: "noSwa",
                 type: "boolean",
@@ -299,15 +320,16 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
     },
     async handler({
         modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize,
-        flashAttention, swaFullCache, threads, temperature, minP, topK,
+        flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK,
         topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
         dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
         debug, numa, meter, timing, noMmap, useDirectIo, printTimings
     }) {
         try {
             await RunCompletion({
-                modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
+                modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
+                kvCacheKeyType, kvCacheValueType, swaFullCache,
                 threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength,
                 dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens,
@@ -323,7 +345,8 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
 
 
 async function RunCompletion({
-    modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
+    modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention,
+    kvCacheKeyType, kvCacheValueType, swaFullCache,
     threads, temperature, minP, topK, topP, seed, xtc, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
     dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
@@ -356,13 +379,17 @@ async function RunCompletion({
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
         swaFullCache,
-        useMmap
+        useMmap,
+        kvCacheKeyType,
+        kvCacheValueType
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
             swaFullCache,
             useMmap,
+            kvCacheKeyType,
+            kvCacheValueType,
             consoleTitle: "Draft model file"
         })
         : undefined;
@@ -400,6 +427,8 @@ async function RunCompletion({
                         ? {fitContext: {contextSize}}
                         : undefined,
                 defaultContextFlashAttention: flashAttention,
+                defaultContextKvCacheKeyType: kvCacheKeyType,
+                defaultContextKvCacheValueType: kvCacheValueType,
                 defaultContextSwaFullCache: swaFullCache,
                 useMmap,
                 useDirectIo,
@@ -435,6 +464,8 @@ async function RunCompletion({
                 return await llama.loadModel({
                     modelPath: resolvedDraftModelPath,
                     defaultContextFlashAttention: flashAttention,
+                    defaultContextKvCacheKeyType: kvCacheKeyType,
+                    defaultContextKvCacheValueType: kvCacheValueType,
                     defaultContextSwaFullCache: swaFullCache,
                     useMmap,
                     useDirectIo,