
Commit ab8104e

Merge pull request #500 from janhq/update-dev-from-master-2026-04-29-01-04
Sync master with upstream release b8966
2 parents c159213 + 7b8443a commit ab8104e

181 files changed

Lines changed: 13197 additions & 9307 deletions


common/arg.cpp

Lines changed: 371 additions & 211 deletions
Large diffs are not rendered by default.

common/arg.h

Lines changed: 4 additions & 2 deletions
@@ -25,7 +25,8 @@ struct common_arg {
     const char * value_hint_2 = nullptr; // for second arg value
     const char * env = nullptr;
     std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
+    bool is_sampling = false; // is current arg a sampling param?
+    bool is_spec = false; // is current arg a speculative decoding param?
     bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
     void (*handler_void) (common_params & params) = nullptr;
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -74,7 +75,8 @@ struct common_arg {
     common_arg & set_examples(std::initializer_list<enum llama_example> examples);
     common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
-    common_arg & set_sparam();
+    common_arg & set_sampling();
+    common_arg & set_spec();
     common_arg & set_preset_only();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
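The rename from set_sparam() to set_sampling(), plus the new set_spec(), simply tags options by category. A sketch of how registration code would use them, assuming the usual common_arg builder pattern from common/arg.cpp (these exact flags and handler bodies are illustrative, not taken from this commit):

    // illustrative registrations; add_opt() and the handler signatures follow
    // the common/arg.cpp pattern, but the options themselves are hypothetical
    add_opt(common_arg(
        {"--temp"}, "N",
        "sampling temperature",
        [](common_params & params, const std::string & value) {
            params.sampling.temp = std::stof(value);
        }
    ).set_sampling()); // previously: .set_sparam()

    add_opt(common_arg(
        {"--draft-max"}, "N",
        "maximum number of tokens to draft",
        [](common_params & params, int value) {
            params.speculative.draft.n_max = value;
        }
    ).set_spec()); // new: marks speculative decoding params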

common/common.cpp

Lines changed: 7 additions & 7 deletions
@@ -70,7 +70,7 @@ common_time_meas::~common_time_meas() {
 // CPU utils
 //
 
-int32_t cpu_get_num_physical_cores() {
+int32_t common_cpu_get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
@@ -185,11 +185,11 @@ static int cpu_count_math_cpus(int n_cpu) {
 /**
  * Returns number of CPUs on system that are useful for math.
  */
-int32_t cpu_get_num_math() {
+int32_t common_cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
     int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (n_cpu < 1) {
-        return cpu_get_num_physical_cores();
+        return common_cpu_get_num_physical_cores();
     }
     if (is_hybrid_cpu()) {
         cpu_set_t affinity;
@@ -202,7 +202,7 @@ int32_t cpu_get_num_math() {
         }
     }
 #endif
-    return cpu_get_num_physical_cores();
+    return common_cpu_get_num_physical_cores();
 }
 
 // Helper for setting process priority
@@ -263,15 +263,15 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 //
 
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
     int32_t n_set = 0;
 
     if (cpuparams.n_threads < 0) {
         // Assuming everything about cpuparams is invalid
         if (role_model != nullptr) {
             cpuparams = *role_model;
         } else {
-            cpuparams.n_threads = cpu_get_num_math();
+            cpuparams.n_threads = common_cpu_get_num_math();
         }
     }
 
@@ -1521,7 +1521,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     return cparams;
 }
 
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
     struct ggml_threadpool_params tpp;
 
     ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
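A minimal caller sketch for the renamed CPU helpers, using only the declarations shown in common/common.h below (the printout itself is illustrative):

    #include <cstdio>
    #include "common.h"

    int main() {
        common_cpu_params cpuparams; // n_threads defaults to -1 (unset)

        // with no role model, this resolves n_threads via common_cpu_get_num_math()
        postprocess_cpu_params(cpuparams);

        struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(cpuparams);
        printf("physical cores: %d, threadpool threads: %d\n",
               common_cpu_get_num_physical_cores(), tpp.n_threads);
        return 0;
    }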

common/common.h

Lines changed: 56 additions & 36 deletions
@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
 // CPU utils
 //
 
-struct cpu_params {
+struct common_cpu_params {
     int n_threads = -1;
     bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
     bool mask_valid = false; // Default: any CPU
@@ -63,8 +63,8 @@ struct cpu_params {
     uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
 
-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
+int32_t common_cpu_get_num_physical_cores();
+int32_t common_cpu_get_num_math();
 
 //
 // Common params
@@ -297,60 +297,80 @@ struct common_params_model {
 
 struct common_ngram_mod;
 
-struct common_params_speculative {
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
-
-    // general-purpose speculative decoding parameters
-
-    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
-    float p_split = 0.1f; // speculative decoding split probability
-    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
-
-    // ngram-based speculative decoding
-
-    uint16_t ngram_size_n = 12; // ngram size for lookup
-    uint16_t ngram_size_m = 48; // mgram size for speculative tokens
-    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
-
-    std::shared_ptr<common_ngram_mod> ngram_mod;
+// draft-model-based speculative decoding parameters
+struct common_params_speculative_draft {
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
 
-    std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
-    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
 
-    // draft-model speculative decoding
+    common_params_model mparams;
 
-    struct common_params_model mparams_dft;
+    llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
 
-    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
-
-    llama_context_params cparams_dft; // these are the parameters for the draft llama_context
+    llama_context_params cparams; // these are the parameters for the draft llama_context
 
     int32_t n_ctx = 0; // draft context size
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
+    common_cpu_params cpuparams;
+    common_cpu_params cpuparams_batch;
 
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
     std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
+};
+
+struct common_params_speculative_ngram_mod {
+    int32_t n_match = 24;
+
+    int32_t n_max = 64;
+    int32_t n_min = 48;
+
+    // shared instance of the ngram container for all speculative decoding contexts
+    std::shared_ptr<common_ngram_mod> obj;
+};
+
+struct common_params_speculative_ngram_map {
+    uint16_t size_n = 12; // ngram size for lookup
+    uint16_t size_m = 48; // mgram size for speculative tokens
+    uint16_t min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
+};
+
+struct common_params_speculative_ngram_cache {
+    std::string lookup_cache_static; // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
+};
+
+struct common_params_speculative {
+    // TODO: become a vector in order to support "chains of speculators"
+    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
+
+    common_params_speculative_draft draft;
+
+    common_params_speculative_ngram_mod ngram_mod;
+    common_params_speculative_ngram_map ngram_simple;
+    common_params_speculative_ngram_map ngram_map_k;
+    common_params_speculative_ngram_map ngram_map_k4v;
+
+    common_params_speculative_ngram_cache ngram_cache;
 
     bool has_dft() const {
-        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
+        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
     }
 };
 
 struct common_params_vocoder {
     struct common_params_model model;
 
-    std::string speaker_file = ""; // speaker file path // NOLINT
+    std::string speaker_file; // speaker file path
 
-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
 };
 
 struct common_params_diffusion {
@@ -433,8 +453,8 @@ struct common_params {
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
+    common_cpu_params cpuparams;
+    common_cpu_params cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -678,7 +698,7 @@ std::string common_params_get_system_info(const common_params & params);
 
 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);
 
 //
@@ -846,7 +866,7 @@ common_init_result_ptr common_init_from_params(common_params & params);
 
 struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
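The practical change for call sites is moving from flat, dft-suffixed fields to the nested draft sub-struct. A hedged before/after sketch, using only the fields declared above (the assignments are illustrative):

    #include "common.h"

    int main() {
        common_params params;

        // before this commit (flat fields):
        //   params.speculative.n_max = 24;
        //   params.speculative.mparams_dft.path = "draft.gguf";

        // after this commit (grouped under the draft sub-struct):
        params.speculative.draft.n_max        = 24;
        params.speculative.draft.mparams.path = "draft.gguf";

        // has_dft() now inspects the nested model params
        return params.speculative.has_dft() ? 0 : 1;
    }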

common/preset.cpp

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
     for (const auto & it : key_to_opt) {
         const std::string & key = it.first;
         const common_arg & opt = it.second;
-        if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
+        if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
             allowed_keys.insert(key);
             // also add variant keys (args without leading dashes and env vars)
             for (const auto & arg : opt.get_args()) {
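Behaviorally, any option tagged via set_sampling() remains whitelisted for remote presets even when not listed explicitly. A self-contained toy of that gating (toy_arg stands in for common_arg; the option names are made up):

    #include <cstdio>
    #include <set>
    #include <string>

    struct toy_arg { bool is_sampling = false; }; // stand-in for common_arg

    int main() {
        std::set<std::string> allowed_options = { "ctx-size" }; // explicit whitelist
        toy_arg temp;
        temp.is_sampling = true; // as set by .set_sampling()

        // mirrors the preset.cpp condition: explicit entry OR sampling param
        bool allowed = allowed_options.count("temp") > 0 || temp.is_sampling;
        printf("temp allowed in remote preset: %s\n", allowed ? "yes" : "no"); // yes
        return 0;
    }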

common/reasoning-budget.cpp

Lines changed: 14 additions & 0 deletions
@@ -122,6 +122,20 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
             }
             break;
         case REASONING_BUDGET_DONE:
+            // Re-arm on a new start tag: some models emit multiple <think> blocks
+            // per response, and each should get a fresh budget window.
+            if (ctx->start_matcher.advance(token)) {
+                ctx->state = REASONING_BUDGET_COUNTING;
+                ctx->remaining = ctx->budget;
+                ctx->end_matcher.reset();
+                LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
+
+                if (ctx->remaining <= 0) {
+                    ctx->state = REASONING_BUDGET_FORCING;
+                    ctx->force_pos = 0;
+                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
+                }
+            }
             break;
     }
 }
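To make the re-arm transition concrete, here is a self-contained toy of the budget state machine (COUNTING -> DONE -> COUNTING); the string tokens and enum are stand-ins for the commit's matcher and sampler plumbing, not its actual types:

    #include <cstdio>
    #include <string>
    #include <vector>

    enum State { IDLE, COUNTING, DONE };

    int main() {
        const int budget = 2;
        State state = IDLE;
        int remaining = 0;

        // a stream with two <think> blocks, as some models produce
        std::vector<std::string> tokens = {
            "<think>", "a", "b", "</think>", "x", "<think>", "c", "</think>"
        };

        for (const auto & tok : tokens) {
            if ((state == IDLE || state == DONE) && tok == "<think>") {
                state = COUNTING;   // (re-)arm on every start tag
                remaining = budget; // fresh budget window per block
                printf("armed, budget=%d\n", remaining);
            } else if (state == COUNTING) {
                if (tok == "</think>") {
                    state = DONE;   // block closed within budget
                } else if (--remaining <= 0) {
                    printf("budget exhausted: the real sampler would force the end tag\n");
                    state = DONE;   // the real code switches to FORCING instead
                }
            }
        }
        return 0;
    }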
