 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
+#include "speculative.h"
 #include "preset.h"

 // fix problem with std::min and std::max
@@ -579,14 +580,14 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             params.mmproj = res.mmproj;
         }
         // only download mmproj if the current example is using it
-        for (auto & ex : mmproj_examples) {
+        for (const auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
                 common_params_handle_model(params.mmproj, params.hf_token, params.offline);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
-        common_params_handle_model(params.vocoder.model,     params.hf_token, params.offline);
+        common_params_handle_model(params.speculative.mparams_dft, params.hf_token, params.offline);
+        common_params_handle_model(params.vocoder.model,           params.hf_token, params.offline);
     }

     // model is required (except for server)
@@ -1216,16 +1217,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-lcs", "--lookup-cache-static"}, "FNAME",
         "path to static lookup cache to use for lookup decoding (not updated by generation)",
         [](common_params & params, const std::string & value) {
-            params.lookup_cache_static = value;
+            params.speculative.lookup_cache_static = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
         "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
         [](common_params & params, const std::string & value) {
-            params.lookup_cache_dynamic = value;
+            params.speculative.lookup_cache_dynamic = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-c", "--ctx-size"}, "N",
         string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
@@ -2563,7 +2564,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
         "Same as --hf-repo, but for the draft model (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.model.hf_repo = value;
+            params.speculative.mparams_dft.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HFD_REPO"));
     add_opt(common_arg(
@@ -3384,7 +3385,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
-            params.speculative.model.path = value;
+            params.speculative.mparams_dft.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
@@ -3394,6 +3395,66 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.replacements.push_back({ tgt, dft });
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    add_opt(common_arg(
+        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
+        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
+            common_speculative_type_to_str(params.speculative.type).c_str()),
+        [](common_params & params, const std::string & value) {
+            if (value == "none") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
+            } else if (value == "ngram-cache") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
+            } else if (value == "ngram-simple") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
+            } else if (value == "ngram-map-k") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
+            } else if (value == "ngram-map-k4v") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
+            } else {
+                throw std::invalid_argument("unknown speculative decoding type without draft model");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--spec-ngram-size-n"}, "N",
+        string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),
+        [](common_params & params, int value) {
+            if (value < 1 || value > 1024) {
+                throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
+            }
+            params.speculative.ngram_size_n = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--spec-ngram-size-m"}, "N",
+        string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m),
+        [](common_params & params, int value) {
+            if (value < 1 || value > 1024) {
+                throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
+            }
+            params.speculative.ngram_size_m = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--spec-ngram-check-rate"}, "N",
+        string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate),
+        [](common_params & params, int value) {
+            if (value < 1) {
+                throw std::invalid_argument("ngram check rate must be at least 1");
+            }
+            params.speculative.ngram_check_rate = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--spec-ngram-min-hits"}, "N",
+        string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
+        [](common_params & params, int value) {
+            if (value < 1) {
+                throw std::invalid_argument("ngram min hits must be at least 1");
+            }
+            params.speculative.ngram_min_hits = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3620,8 +3681,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.port = 8012;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
@@ -3636,8 +3697,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
             params.port = 8012;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
0 commit comments