
Commit 9dbd161

Merge pull request #501 from janhq/update-dev-from-master-2026-04-30-01-04
Sync master with upstream release b8981
2 parents ab8104e + d775992 commit 9dbd161

38 files changed

Lines changed: 1318 additions & 823 deletions

common/log.cpp

Lines changed: 11 additions & 8 deletions
```diff
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };
 
 // disable colors by default
-static std::vector<const char *> g_col = {
+static const char* g_col[] = {
     "",
     "",
     "",
@@ -247,7 +247,6 @@ struct common_log {
 
             entries = std::move(new_entries);
         }
-
         cv.notify_one();
     }
 
@@ -265,7 +264,6 @@ struct common_log {
         {
             std::unique_lock<std::mutex> lock(mtx);
             cv.wait(lock, [this]() { return head != tail; });
-
             cur = entries[head];
 
             head = (head + 1) % entries.size();
@@ -301,7 +299,6 @@ struct common_log {
 
             tail = (tail + 1) % entries.size();
         }
-
        cv.notify_one();
    }
 
@@ -338,7 +335,7 @@ struct common_log {
            g_col[COMMON_LOG_COL_CYAN]  = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
        } else {
-            for (size_t i = 0; i < g_col.size(); i++) {
+            for (size_t i = 0; i < std::size(g_col); i++) {
                g_col[i] = "";
            }
        }
@@ -368,14 +365,20 @@ struct common_log * common_log_init() {
 }
 
 struct common_log * common_log_main() {
-    static struct common_log log;
+    // We intentionally leak (i.e. do not delete) the logger singleton because
+    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
+    // OS will release resources anyway so it should not be a significant issue,
+    // though this design may cause logs to be lost if not flushed before the program exits.
+    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
+    static struct common_log * log;
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
+        log = new common_log;
        // Set default to auto-detect colors
-        log.set_colors(tty_can_use_colors());
+        log->set_colors(tty_can_use_colors());
     });
 
-    return &log;
+    return log;
 }
 
 void common_log_pause(struct common_log * log) {
```

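For readers unfamiliar with the pattern: the change above swaps a function-local static object for an intentionally leaked heap object, so no destructor runs during DLL/process teardown. A minimal stand-alone sketch of that leaked-singleton pattern, using a hypothetical Logger class rather than common_log:

```cpp
#include <cstdio>
#include <mutex>

struct Logger {
    void log(const char * msg) { std::fprintf(stderr, "%s\n", msg); }
    // The real common_log owns a worker thread (see the pause/resume declarations
    // in log.h); stopping it from a destructor during DLL teardown is what hangs
    // on Windows. With the leaked pattern this destructor simply never runs.
    ~Logger() {}
};

Logger * logger_main() {
    static Logger * instance;           // raw pointer: nothing is registered for destruction at exit
    static std::once_flag init_flag;
    std::call_once(init_flag, [&]() {
        instance = new Logger();        // intentionally leaked; the OS reclaims the memory
    });
    return instance;
}

int main() {
    logger_main()->log("hello");        // always go through the accessor
    return 0;                           // process exit does not touch the Logger
}
```

The trade-off, as the comment in the diff notes, is that any buffered output still queued at exit is lost unless it is flushed explicitly.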
common/log.h

Lines changed: 5 additions & 1 deletion
```diff
@@ -49,7 +49,11 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;
 
 struct common_log * common_log_init();
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+
+// Singleton, intentionally leaked to avoid Windows teardown hangs.
+// Call common_log_flush() before exit if you want to ensure all logs are flushed.
+struct common_log * common_log_main();
+
 void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
 void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void common_log_free  (struct common_log * log);
```

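Because the singleton is never destroyed, nothing flushes its queue automatically at exit. A sketch of the shutdown call the new comment recommends; this diff does not show the signature of common_log_flush(), so the pointer-taking form below (mirroring common_log_pause/resume/free) is an assumption:

```cpp
#include "log.h"  // common/log.h from this repository

// Hypothetical shutdown helper: flush pending entries explicitly before exit,
// since the leaked singleton's destructor will never run.
void shutdown_logging() {
    struct common_log * log = common_log_main();
    common_log_flush(log);  // assumed signature, mirroring the other functions in this header
}
```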
common/reasoning-budget.cpp

Lines changed: 0 additions & 28 deletions
@@ -232,34 +232,6 @@ static struct llama_sampler * common_reasoning_budget_init_state(
232232
);
233233
}
234234

235-
struct llama_sampler * common_reasoning_budget_init(
236-
const struct llama_vocab * vocab,
237-
const std::vector<llama_token> & start_tokens,
238-
const std::vector<llama_token> & end_tokens,
239-
const std::vector<llama_token> & forced_tokens,
240-
int32_t budget,
241-
const std::vector<llama_token> & prefill_tokens) {
242-
// Determine initial state from prefill: COUNTING if the prefill begins with
243-
// the start sequence but does not also contain the end sequence after it.
244-
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
245-
if (!prefill_tokens.empty() && !start_tokens.empty() &&
246-
prefill_tokens.size() >= start_tokens.size() &&
247-
std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
248-
initial_state = REASONING_BUDGET_COUNTING;
249-
// If the end sequence also follows the start in the prefill, reasoning
250-
// was opened and immediately closed — stay IDLE.
251-
if (!end_tokens.empty() &&
252-
prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
253-
auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
254-
if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
255-
std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
256-
initial_state = REASONING_BUDGET_IDLE;
257-
}
258-
}
259-
}
260-
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
261-
}
262-
263235
struct llama_sampler * common_reasoning_budget_init(
264236
const struct llama_vocab * vocab,
265237
const std::vector<llama_token> & start_tokens,

common/reasoning-budget.h

Lines changed: 2 additions & 15 deletions
```diff
@@ -29,27 +29,14 @@ enum common_reasoning_budget_state {
 // end_tokens    - token sequence for natural deactivation
 // forced_tokens - token sequence forced when budget expires
 // budget        - max tokens allowed in the reasoning block
-// prefill_tokens - tokens already present in the prompt (generation prompt);
-//                  used to determine the initial state: COUNTING if they begin
-//                  with start_tokens (but don't also end with end_tokens),
-//                  IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
+// initial_state - initial state
 //
 struct llama_sampler * common_reasoning_budget_init(
         const struct llama_vocab * vocab,
         const std::vector<llama_token> & start_tokens,
         const std::vector<llama_token> & end_tokens,
         const std::vector<llama_token> & forced_tokens,
         int32_t budget,
-        const std::vector<llama_token> & prefill_tokens = {});
-
-// Variant that takes an explicit initial state (used by tests and clone).
-// COUNTING with budget <= 0 is promoted to FORCING.
-struct llama_sampler * common_reasoning_budget_init(
-        const struct llama_vocab * vocab,
-        const std::vector<llama_token> & start_tokens,
-        const std::vector<llama_token> & end_tokens,
-        const std::vector<llama_token> & forced_tokens,
-        int32_t budget,
-        common_reasoning_budget_state initial_state);
+        common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE);
 
 common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
```

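The overload deleted from reasoning-budget.cpp above derived the initial state from the prefill tokens by prefix/suffix matching; the header now takes an explicit initial_state instead. For reference, a stand-alone sketch of the removed decision rule, using plain int token IDs in place of llama_token and a local enum in place of the library's state values:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

enum reasoning_state { IDLE, COUNTING };   // stand-ins for REASONING_BUDGET_IDLE / _COUNTING

// Mirrors the removed overload: COUNTING if the prefill begins with the start
// sequence and the end sequence does not already close it, IDLE otherwise.
static reasoning_state detect_initial_state(
        const std::vector<int> & start_tokens,
        const std::vector<int> & end_tokens,
        const std::vector<int> & prefill_tokens) {
    if (prefill_tokens.empty() || start_tokens.empty() ||
        prefill_tokens.size() < start_tokens.size() ||
        !std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
        return IDLE;
    }
    // Reasoning block was opened; check whether the prefill also closes it.
    if (!end_tokens.empty() &&
        prefill_tokens.size() >= start_tokens.size() + end_tokens.size() &&
        std::equal(end_tokens.begin(), end_tokens.end(),
                   prefill_tokens.end() - (std::ptrdiff_t) end_tokens.size())) {
        return IDLE;
    }
    return COUNTING;
}

int main() {
    // <think> = 1, </think> = 2 (hypothetical token IDs)
    const std::vector<int> start = {1}, end = {2};
    detect_initial_state(start, end, {1, 7, 8});     // COUNTING: reasoning is still open
    detect_initial_state(start, end, {1, 7, 8, 2});  // IDLE: opened and already closed
    return 0;
}
```

Callers that still want this behaviour can compute the state themselves and pass it to the simplified initializer.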
common/sampling.cpp

Lines changed: 34 additions & 25 deletions
```diff
@@ -260,32 +260,35 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         }
     }
 
-    // Feed generation prompt tokens to the grammar sampler so it advances past
-    // tokens the template already placed in the prompt.
-    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+    // Compute prefill tokens from the generation prompt
     std::vector<llama_token> prefill_tokens;
-    if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
+    if (!params.generation_prompt.empty()) {
         GGML_ASSERT(vocab != nullptr);
-        prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
-        if (!prefill_tokens.empty()) {
-            std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
-            if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
-                // Some tokenizers will add a space before the first special token, need to remove
-                prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
+        auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
+        for (size_t i = 0; i < tokens.size(); i++) {
+            std::string piece = common_token_to_piece(vocab, tokens[i], true);
+            if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
+                // Some tokenizers will add a space before the first special token, need to exclude
+                continue;
             }
+            LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
+            prefill_tokens.push_back(tokens[i]);
         }
+    }
 
-        if (grmr && !params.grammar_lazy) {
-            try {
-                for (const auto & token : prefill_tokens) {
-                    llama_sampler_accept(grmr, token);
-                    LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
-                }
-            } catch (std::exception &e) {
-                LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
-                    common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
-                throw e;
+    // Feed generation prompt tokens to the grammar sampler so it advances past
+    // tokens the template already placed in the prompt.
+    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+    if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
+        try {
+            for (const auto & token : prefill_tokens) {
+                llama_sampler_accept(grmr, token);
+                LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
             }
+        } catch (std::exception &e) {
+            LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
+                common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
+            throw e;
         }
     }
 
@@ -296,8 +299,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             params.reasoning_budget_start,
             params.reasoning_budget_end,
             params.reasoning_budget_forced,
-            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
-            prefill_tokens);
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
+
+        for (const auto & token : prefill_tokens) {
+            llama_sampler_accept(rbudget, token);
+            LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
+        }
     }
 
     if (params.has_logit_bias()) {
@@ -431,17 +438,19 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
     return true;
 }
 
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
     if (!gsmpl) {
         return;
     }
 
     const auto tm = gsmpl->tm();
 
     // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
-    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
+    const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
 
-    llama_sampler_accept(gsmpl->rbudget, token);
+    if (gsmpl->rbudget && is_generated) {
+        llama_sampler_accept(gsmpl->rbudget, token);
+    }
 
     if (gsmpl->grmr && accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
```

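The restructured block above separates two concerns: building prefill_tokens once (dropping a tokenizer-inserted leading space) and then replaying them into whichever samplers need them. A self-contained sketch of just the leading-space rule, using a hypothetical token_piece type instead of the llama.cpp tokenizer API:

```cpp
#include <cctype>
#include <string>
#include <vector>

// Hypothetical tokenizer output: each token's id and its decoded text piece.
struct token_piece { int id; std::string piece; };

// Mirrors the new loop above: collect prefill tokens, skipping a leading token
// whose piece starts with whitespace that the prompt itself does not have
// (some tokenizers insert a space before the first special token).
static std::vector<int> build_prefill(const std::string & prompt,
                                      const std::vector<token_piece> & tokens) {
    std::vector<int> prefill;
    for (size_t i = 0; i < tokens.size(); i++) {
        const std::string & piece = tokens[i].piece;
        if (i == 0 && !piece.empty() && !prompt.empty() &&
            std::isspace((unsigned char) piece[0]) &&
            !std::isspace((unsigned char) prompt[0])) {
            continue; // tokenizer-added space, not part of the template output
        }
        prefill.push_back(tokens[i].id);
    }
    return prefill;
}

int main() {
    // "<think>" hypothetically tokenized as [" ", "<think>"] by a tokenizer
    // that prepends a space before the first special token
    std::vector<token_piece> toks = {{10, " "}, {11, "<think>"}};
    build_prefill("<think>", toks); // yields {11}: the spurious space token is dropped
    return 0;
}
```

The real loop additionally logs each kept token via LOG_DBG; the selection logic is otherwise the same.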
common/sampling.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
-// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
 void common_sampler_reset (struct common_sampler * gsmpl);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
```

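To make the renamed parameter concrete: with is_generated, replayed prompt tokens no longer advance the reasoning-budget or grammar samplers; only genuinely sampled tokens do. A toy model of that gating, with mock sampler types standing in for gsmpl->rbudget and gsmpl->grmr:

```cpp
#include <cstdio>

// Mock samplers standing in for the reasoning-budget and grammar samplers.
struct mock_sampler { int accepted = 0; void accept(int /*token*/) { accepted++; } };

struct mock_common_sampler {
    mock_sampler rbudget;
    mock_sampler grmr;
    bool grammar_should_apply() const { return true; } // e.g. not currently inside a reasoning block
};

// Mirrors the accept logic in the diff above: only generated tokens reach the
// reasoning-budget and grammar samplers; replayed prompt tokens do not.
void sampler_accept(mock_common_sampler & s, int token, bool is_generated) {
    const bool accept_grammar = is_generated && s.grammar_should_apply();
    if (is_generated) {
        s.rbudget.accept(token);
    }
    if (accept_grammar) {
        s.grmr.accept(token);
    }
    // ... the main sampling chain would accept the token here regardless ...
}

int main() {
    mock_common_sampler s;
    sampler_accept(s, 42, /*is_generated=*/true);   // sampled token: advances both samplers
    sampler_accept(s, 7,  /*is_generated=*/false);  // prompt token: advances neither
    std::printf("rbudget=%d grmr=%d\n", s.rbudget.accepted, s.grmr.accepted); // prints 1 1
    return 0;
}
```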
common/speculative.cpp

Lines changed: 6 additions & 6 deletions
```diff
@@ -467,7 +467,7 @@ struct common_speculative_state_draft : public common_speculative_state {
 
         prompt_dft.push_back(id_last);
 
-        LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
+        //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
 
         int ret = llama_decode(ctx_dft, batch);
         if (ret != 0 && ret != 1) {
@@ -495,14 +495,14 @@
 
             common_sampler_accept(smpl, id, true);
 
-            result.push_back(id);
-
-            if (sparams.n_max <= (int) result.size()) {
+            // only collect very high-confidence draft tokens
+            if (cur_p->data[0].p < sparams.p_min) {
                 break;
             }
 
-            // only collect very high-confidence draft tokens
-            if (cur_p->data[0].p < sparams.p_min) {
+            result.push_back(id);
+
+            if (sparams.n_max <= (int) result.size()) {
                 break;
             }
 
```

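The reorder above matters because the old loop pushed the sampled draft token before checking its probability, so one low-confidence token could end up in the draft. A compact stand-alone sketch of the corrected collection order, with hypothetical candidate and parameter types in place of the llama.cpp ones:

```cpp
#include <vector>

struct candidate    { int id; float p; };        // hypothetical: top candidate after sampling one draft step
struct draft_params { float p_min; int n_max; }; // mirrors sparams.p_min / sparams.n_max

// Corrected order, as in the diff above: reject a low-confidence token *before*
// pushing it, then stop once n_max tokens have been collected.
std::vector<int> collect_draft(const std::vector<candidate> & sampled, const draft_params & sp) {
    std::vector<int> result;
    for (const auto & c : sampled) {
        // only collect very high-confidence draft tokens
        if (c.p < sp.p_min) {
            break;
        }
        result.push_back(c.id);
        if (sp.n_max <= (int) result.size()) {
            break;
        }
    }
    return result;
}

int main() {
    // the third token falls below p_min, so only the first two make it into the draft
    std::vector<candidate> sampled = {{5, 0.99f}, {9, 0.97f}, {3, 0.40f}, {8, 0.95f}};
    collect_draft(sampled, {0.90f, 16}); // -> {5, 9}
    return 0;
}
```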
ggml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 10)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
```

ggml/src/ggml-backend-meta.cpp

Lines changed: 18 additions & 1 deletion
```diff
@@ -1826,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
             continue;
         }
 
-        i = get_i_delayed(i);
+        const int i_delayed = get_i_delayed(i);
+
+        // If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
+        // A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
+        // its compute flag disabled and thus gets its data zeroed out.
+        // If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
+        if (i_delayed > i) {
+            for (size_t j = 0; j < n_backends; j++) {
+                auto & bcj = backend_ctx->backend_configs[j];
+                if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                    for (int ii = i + 1; ii <= i_delayed; ii++) {
+                        bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
+                    }
+                }
+            }
+        }
+
+        i = i_delayed;
 
         for (size_t j = 0; j < n_backends; j++) {
             auto & bcj = backend_ctx->backend_configs[j];
```

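The new block above propagates a disabled compute flag forward whenever an AllReduce is delayed, so a backend holding a zero-sized slice does not keep computing on data that was zeroed out. A simplified stand-alone model of that propagation over plain boolean flag arrays, with hypothetical types in place of the ggml ones:

```cpp
#include <vector>

// Simplified stand-in for one backend's node list: true means the node's
// GGML_TENSOR_FLAG_COMPUTE bit is set.
using node_flags = std::vector<bool>;

// Mirrors the new logic above: if the AllReduce for node i is delayed until
// i_delayed, any backend whose node i has compute disabled must also have the
// nodes (i, i_delayed] disabled, otherwise it would compute on zeroed-out data.
void propagate_disabled_compute(std::vector<node_flags> & backends, int i, int i_delayed) {
    if (i_delayed <= i) {
        return; // AllReduce not delayed, nothing to do
    }
    for (auto & nodes : backends) {
        if (!nodes[i]) {
            for (int ii = i + 1; ii <= i_delayed; ii++) {
                nodes[ii] = false;
            }
        }
    }
}

int main() {
    // backend 0 computes node 2, backend 1 has it disabled (e.g. a zero-sized slice)
    std::vector<node_flags> backends = {
        {true, true, true,  true, true},
        {true, true, false, true, true},
    };
    propagate_disabled_compute(backends, /*i=*/2, /*i_delayed=*/4);
    // backend 1 now has nodes 3 and 4 disabled as well; backend 0 is untouched
    return 0;
}
```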
ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
```diff
@@ -485,6 +485,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZIHINTPAUSE)
                 string(APPEND MARCH_STR "_zihintpause")
             endif()
+            if (GGML_CPU_RISCV64_SPACEMIT)
+                # `xsmtvdotii' is only required for GCC >= 15.
+                if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+                    CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
+                    string(APPEND MARCH_STR "_xsmtvdotii")
+                endif()
+            endif()
 
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         else()
```
