@@ -2426,6 +2426,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
24262426 }
24272427 }
24282428 ).set_env (" LLAMA_ARG_FIT" ));
2429+ add_opt (common_arg (
2430+ { " -fitp" , " --fit-print" }, " [on|off]" ,
2431+ string_format (" print the estimated required memory ('on' or 'off', default: '%s')" , params.fit_params_print ? " on" : " off" ),
2432+ [](common_params & params, const std::string & value) {
2433+ if (is_truthy (value)) {
2434+ params.fit_params_print = true ;
2435+ } else if (is_falsey (value)) {
2436+ params.fit_params_print = false ;
2437+ } else {
2438+ throw std::runtime_error (
2439+ string_format (" error: unknown value for --fit-print: '%s'\n " , value.c_str ()));
2440+ }
2441+ }
2442+ ).set_examples ({LLAMA_EXAMPLE_FIT_PARAMS}).set_env (" LLAMA_ARG_FIT_ESTIMATE" ));
24292443 add_opt (common_arg (
24302444 { " -fitt" , " --fit-target" }, " MiB0,MiB1,MiB2,..." ,
24312445 string_format (" target margin per device for --fit, comma-separated list of values, "
@@ -3108,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
31083122 " token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)" ,
31093123 [](common_params & params, int value) {
31103124 if (value < -1 ) { throw std::invalid_argument (" invalid value" ); }
3111- params.reasoning_budget = value;
3125+ params.sampling . reasoning_budget_tokens = value;
31123126 }
31133127 ).set_examples ({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env (" LLAMA_ARG_THINK_BUDGET" ));
31143128 add_opt (common_arg (
31153129 {" --reasoning-budget-message" }, " MESSAGE" ,
31163130 " message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)" ,
31173131 [](common_params & params, const std::string & value) {
3118- params.reasoning_budget_message = value;
3132+ params.sampling . reasoning_budget_message = value;
31193133 }
31203134 ).set_examples ({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env (" LLAMA_ARG_THINK_BUDGET_MESSAGE" ));
31213135 add_opt (common_arg (
@@ -3888,6 +3902,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
38883902 }
38893903 ).set_examples ({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
38903904
3905+ add_opt (common_arg (
3906+ {" --spec-default" },
3907+ string_format (" enable default speculative decoding config" ),
3908+ [](common_params & params) {
3909+ params.speculative .type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
3910+ params.speculative .ngram_size_n = 24 ;
3911+ params.speculative .n_min = 48 ;
3912+ params.speculative .n_max = 64 ;
3913+ }
3914+ ).set_examples ({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3915+
38913916 return ctx_arg;
38923917}
38933918
0 commit comments