@@ -31,6 +31,7 @@ import {withCliCommandDescriptionDocsUrl} from "../utils/withCliCommandDescripti
 import {ConsoleInteraction, ConsoleInteractionKey} from "../utils/ConsoleInteraction.js";
 import {DraftSequenceTokenPredictor} from "../../evaluator/LlamaContext/tokenPredictors/DraftSequenceTokenPredictor.js";
 import {ParsedXtcArg, parseXtcArg} from "../utils/parseXtcArg.js";
+import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";
 
 type ChatCommand = {
     modelPath?: string,
@@ -46,6 +47,8 @@ type ChatCommand = {
     contextSize?: number,
     batchSize?: number,
     flashAttention?: boolean,
+    kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType,
+    kvCacheValueType?: "currentQuant" | keyof typeof GgmlType,
     swaFullCache?: boolean,
     noTrimWhitespace: boolean,
     grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[1],
@@ -172,6 +175,24 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Enable flash attention"
             })
+            .option("kvCacheKeyType", {
+                alias: "kvckt",
+                type: "string",
+                choices: [
+                    "currentQuant",
+                    ...Object.values(GgmlType).filter((value) => typeof value === "string") as (keyof typeof GgmlType)[]
+                ] as const,
+                description: "The type of the key for the context KV cache tensors"
+            })
+            .option("kvCacheValueType", {
+                alias: "kvcvt",
+                type: "string",
+                choices: [
+                    "currentQuant",
+                    ...Object.values(GgmlType).filter((value) => typeof value === "string") as (keyof typeof GgmlType)[]
+                ] as const,
+                description: "The type of the value for the context KV cache tensors"
+            })
             .option("swaFullCache", {
                 alias: "noSwa",
                 type: "boolean",
@@ -379,7 +400,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
     },
     async handler({
         modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt,
-        promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache,
+        promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache,
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
@@ -390,8 +411,8 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         try {
             await RunChat({
                 modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize,
-                batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads,
-                temperature, minP, topK, topP, seed, xtc,
+                batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile,
+                threads, temperature, minP, topK, topP, seed, xtc,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens,
                 maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
@@ -408,7 +429,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
 
 async function RunChat({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja,
-    contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
+    contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg,
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength,
@@ -444,12 +465,16 @@ async function RunChat({
     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
         swaFullCache,
+        kvCacheKeyType,
+        kvCacheValueType,
         useMmap
     });
     const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "")
         ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, {
             flashAttention,
             swaFullCache,
+            kvCacheKeyType,
+            kvCacheValueType,
             useMmap,
             consoleTitle: "Draft model file"
         })
@@ -495,6 +520,8 @@ async function RunChat({
                         ? {fitContext: {contextSize}}
                         : undefined,
                 defaultContextFlashAttention: flashAttention,
+                defaultContextKvCacheKeyType: kvCacheKeyType,
+                defaultContextKvCacheValueType: kvCacheValueType,
                 defaultContextSwaFullCache: swaFullCache,
                 useMmap,
                 useDirectIo,
@@ -530,6 +557,8 @@ async function RunChat({
             return await llama.loadModel({
                 modelPath: resolvedDraftModelPath,
                 defaultContextFlashAttention: flashAttention,
+                defaultContextKvCacheKeyType: kvCacheKeyType,
+                defaultContextKvCacheValueType: kvCacheValueType,
                 defaultContextSwaFullCache: swaFullCache,
                 useMmap,
                 useDirectIo,
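
Note: the hunks above forward the raw --kvCacheKeyType / --kvCacheValueType option strings straight into the model's default context options; how such a value could be turned into a concrete GgmlType is not shown in this diff. Below is a minimal sketch of that resolution, assuming GgmlType is a numeric enum whose member names match the CLI choices (the resolveKvCacheType helper is hypothetical and not part of this diff):

import {GgmlType} from "../../gguf/types/GgufTensorInfoTypes.js";

// Hypothetical helper (not part of this diff): maps the CLI option value to a
// concrete GgmlType, leaves "currentQuant" untouched, and returns undefined
// when the option was not set so the library default applies.
function resolveKvCacheType(
    value?: "currentQuant" | keyof typeof GgmlType
): GgmlType | "currentQuant" | undefined {
    if (value == null || value === "currentQuant")
        return value;

    // assumes GgmlType is a numeric enum, so indexing the enum object by
    // member name yields the numeric ggml type id
    return GgmlType[value];
}

With the options added above, an invocation could look like: node-llama-cpp chat --modelPath <model path> --kvckt Q8_0 --kvcvt Q8_0, assuming the GgmlType enum exposes a member named Q8_0; the accepted names are whatever keys that enum actually defines, plus "currentQuant".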