@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
5454// CPU utils
5555//
5656
57- struct cpu_params {
57+ struct common_cpu_params {
5858 int n_threads = -1 ;
5959 bool cpumask[GGML_MAX_N_THREADS] = {false }; // CPU affinity mask.
6060 bool mask_valid = false ; // Default: any CPU
@@ -63,8 +63,8 @@ struct cpu_params {
6363 uint32_t poll = 50 ; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
6464};
6565
66- int32_t cpu_get_num_physical_cores ();
67- int32_t cpu_get_num_math ();
66+ int32_t common_cpu_get_num_physical_cores ();
67+ int32_t common_cpu_get_num_math ();
6868
6969//
7070// Common params
@@ -297,60 +297,80 @@ struct common_params_model {
297297
298298struct common_ngram_mod ;
299299
300- struct common_params_speculative {
301- common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
302-
303- // general-purpose speculative decoding parameters
304-
305- int32_t n_max = 16 ; // maximum number of tokens to draft during speculative decoding
306- int32_t n_min = 0 ; // minimum number of draft tokens to use for speculative decoding
307- float p_split = 0 .1f ; // speculative decoding split probability
308- float p_min = 0 .75f ; // minimum speculative decoding probability (greedy)
309-
310- // ngram-based speculative decoding
311-
312- uint16_t ngram_size_n = 12 ; // ngram size for lookup
313- uint16_t ngram_size_m = 48 ; // mgram size for speculative tokens
314- uint16_t ngram_min_hits = 1 ; // minimum hits at ngram/mgram lookup for mgram to be proposed
315-
316- std::shared_ptr<common_ngram_mod> ngram_mod;
300+ // draft-model-based speculative decoding parameters
301+ struct common_params_speculative_draft {
302+ int32_t n_max = 16 ; // maximum number of tokens to draft during speculative decoding
303+ int32_t n_min = 0 ; // minimum number of draft tokens to use for speculative decoding
317304
318- std::string lookup_cache_static ; // path of static ngram cache file for lookup decoding // NOLINT
319- std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
305+ float p_split = 0 . 1f ; // speculative decoding split probability
306+ float p_min = 0 . 75f ; // minimum speculative decoding probability (greedy)
320307
321- // draft-model speculative decoding
308+ common_params_model mparams;
322309
323- struct common_params_model mparams_dft;
310+ llama_model * model = nullptr ; // a llama_model that can be shared by multiple speculative contexts
324311
325- llama_model * model_dft = nullptr ; // a llama_model that can be shared by multiple speculative contexts
326-
327- llama_context_params cparams_dft; // these are the parameters for the draft llama_context
312+ llama_context_params cparams; // these are the parameters for the draft llama_context
328313
329314 int32_t n_ctx = 0 ; // draft context size
330315 int32_t n_gpu_layers = -1 ; // number of layers to store in VRAM for the draft model (-1 - use default)
331316
332317 ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
333318 ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
334319
335- struct cpu_params cpuparams;
336- struct cpu_params cpuparams_batch;
320+ common_cpu_params cpuparams;
321+ common_cpu_params cpuparams_batch;
337322
338323 std::vector<ggml_backend_dev_t > devices; // devices to use for offloading
339324
340325 std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
341326 std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
327+ };
328+
329+ struct common_params_speculative_ngram_mod {
330+ int32_t n_match = 24 ;
331+
332+ int32_t n_max = 64 ;
333+ int32_t n_min = 48 ;
334+
335+ // shared instance of the ngram container for all speculative decoding contexts
336+ std::shared_ptr<common_ngram_mod> obj;
337+ };
338+
339+ struct common_params_speculative_ngram_map {
340+ uint16_t size_n = 12 ; // ngram size for lookup
341+ uint16_t size_m = 48 ; // mgram size for speculative tokens
342+ uint16_t min_hits = 1 ; // minimum hits at ngram/mgram lookup for mgram to be proposed
343+ };
344+
345+ struct common_params_speculative_ngram_cache {
346+ std::string lookup_cache_static; // path of static ngram cache file for lookup decoding
347+ std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
348+ };
349+
350+ struct common_params_speculative {
351+ // TODO: become a vector in order to support "chains of speculators"
352+ common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
353+
354+ common_params_speculative_draft draft;
355+
356+ common_params_speculative_ngram_mod ngram_mod;
357+ common_params_speculative_ngram_map ngram_simple;
358+ common_params_speculative_ngram_map ngram_map_k;
359+ common_params_speculative_ngram_map ngram_map_k4v;
360+
361+ common_params_speculative_ngram_cache ngram_cache;
342362
343363 bool has_dft () const {
344- return !mparams_dft. path .empty () || !mparams_dft .hf_repo .empty ();
364+ return !draft. mparams . path .empty () || !draft. mparams .hf_repo .empty ();
345365 }
346366};
347367
348368struct common_params_vocoder {
349369 struct common_params_model model;
350370
351- std::string speaker_file = " " ; // speaker file path // NOLINT
371+ std::string speaker_file; // speaker file path
352372
353- bool use_guide_tokens = false ; // enable guide tokens to improve TTS accuracy // NOLINT
373+ bool use_guide_tokens = false ; // enable guide tokens to improve TTS accuracy
354374};
355375
356376struct common_params_diffusion {
@@ -433,8 +453,8 @@ struct common_params {
433453
434454 enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
435455
436- struct cpu_params cpuparams;
437- struct cpu_params cpuparams_batch;
456+ common_cpu_params cpuparams;
457+ common_cpu_params cpuparams_batch;
438458
439459 ggml_backend_sched_eval_callback cb_eval = nullptr ;
440460 void * cb_eval_user_data = nullptr ;
@@ -678,7 +698,7 @@ std::string common_params_get_system_info(const common_params & params);
678698
679699bool parse_cpu_range (const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]);
680700bool parse_cpu_mask (const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]);
681- void postprocess_cpu_params (cpu_params & cpuparams, const cpu_params * role_model = nullptr );
701+ void postprocess_cpu_params (common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr );
682702bool set_process_priority (enum ggml_sched_priority prio);
683703
684704//
@@ -846,7 +866,7 @@ common_init_result_ptr common_init_from_params(common_params & params);
846866
847867struct llama_model_params common_model_params_to_llama ( common_params & params);
848868struct llama_context_params common_context_params_to_llama (const common_params & params);
849- struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params (const cpu_params & params);
869+ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params (const common_cpu_params & params);
850870
851871// clear LoRA adapters from context, then apply new list of adapters
852872void common_set_adapter_lora (struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
0 commit comments