@@ -292,7 +292,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_ex
         hf_tag = "default";
     }

-    std::string model_endpoint = get_model_endpoint();
+    std::string model_endpoint = common_get_model_endpoint();
     auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

     // prepare local path for caching
@@ -1339,13 +1339,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
-        {"--clear-idle"},
-        {"--no-clear-idle"},
+        {"--cache-idle-slots"},
+        {"--no-cache-idle-slots"},
         "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
         [](common_params & params, bool value) {
-            params.clear_idle = value;
+            params.cache_idle_slots = value;
         }
-    ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CACHE_IDLE_SLOTS").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
@@ -2449,6 +2449,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitp", "--fit-print" }, "[on|off]",
+        string_format("print the estimated required memory ('on' or 'off', default: '%s')", params.fit_params_print ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params_print = true;
+            } else if (is_falsey(value)) {
+                params.fit_params_print = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unknown value for --fit-print: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_FIT_PARAMS}).set_env("LLAMA_ARG_FIT_ESTIMATE"));
     add_opt(common_arg(
         { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
         string_format("target margin per device for --fit, comma-separated list of values, "
@@ -3131,14 +3145,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
         [](common_params & params, int value) {
             if (value < -1) { throw std::invalid_argument("invalid value"); }
-            params.reasoning_budget = value;
+            params.sampling.reasoning_budget_tokens = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--reasoning-budget-message"}, "MESSAGE",
         "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
         [](common_params & params, const std::string & value) {
-            params.reasoning_budget_message = value;
+            params.sampling.reasoning_budget_message = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
     add_opt(common_arg(
@@ -3911,6 +3925,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

+    add_opt(common_arg(
+        {"--spec-default"},
+        string_format("enable default speculative decoding config"),
+        [](common_params & params) {
+            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.ngram_size_n = 24;
+            params.speculative.n_min = 48;
+            params.speculative.n_max = 64;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
     return ctx_arg;
 }
