
Commit 9dbd161

Merge pull request #501 from janhq/update-dev-from-master-2026-04-30-01-04
Sync master with upstream release b8981
2 parents ab8104e + d775992 commit 9dbd161

38 files changed

Lines changed: 1318 additions & 823 deletions

common/log.cpp

Lines changed: 11 additions & 8 deletions
```diff
@@ -49,7 +49,7 @@ enum common_log_col : int {
 };
 
 // disable colors by default
-static std::vector<const char *> g_col = {
+static const char* g_col[] = {
     "",
     "",
     "",
@@ -247,7 +247,6 @@ struct common_log {
 
             entries = std::move(new_entries);
         }
-
         cv.notify_one();
     }
 
@@ -265,7 +264,6 @@ struct common_log {
         {
             std::unique_lock<std::mutex> lock(mtx);
             cv.wait(lock, [this]() { return head != tail; });
-
             cur = entries[head];
 
             head = (head + 1) % entries.size();
@@ -301,7 +299,6 @@ struct common_log {
 
             tail = (tail + 1) % entries.size();
         }
-
        cv.notify_one();
    }
 
@@ -338,7 +335,7 @@ struct common_log {
            g_col[COMMON_LOG_COL_CYAN]  = LOG_COL_CYAN;
            g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
        } else {
-            for (size_t i = 0; i < g_col.size(); i++) {
+            for (size_t i = 0; i < std::size(g_col); i++) {
                g_col[i] = "";
            }
        }
@@ -368,14 +365,20 @@ struct common_log * common_log_init() {
 }
 
 struct common_log * common_log_main() {
-    static struct common_log log;
+    // We intentionally leak (i.e. do not delete) the logger singleton because
+    // common_log destructor called at DLL teardown phase will cause hanging on Windows.
+    // OS will release resources anyway so it should not be a significant issue,
+    // though this design may cause logs to be lost if not flushed before the program exits.
+    // Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
+    static struct common_log * log;
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
+        log = new common_log;
        // Set default to auto-detect colors
-        log.set_colors(tty_can_use_colors());
+        log->set_colors(tty_can_use_colors());
     });
 
-    return &log;
+    return log;
 }
 
 void common_log_pause(struct common_log * log) {
```

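For readers unfamiliar with the pattern: the change above swaps a function-local static object for an intentionally leaked heap object, so no destructor runs during DLL/process teardown. A minimal stand-alone sketch of that leaked-singleton pattern, using a hypothetical Logger class rather than common_log:

```cpp
#include <cstdio>
#include <mutex>

struct Logger {
    void log(const char * msg) { std::fprintf(stderr, "%s\n", msg); }
    // The real common_log owns a worker thread (see the pause/resume declarations
    // in log.h); stopping it from a destructor during DLL teardown is what hangs
    // on Windows. With the leaked pattern this destructor simply never runs.
    ~Logger() {}
};

Logger * logger_main() {
    static Logger * instance;           // raw pointer: nothing is registered for destruction at exit
    static std::once_flag init_flag;
    std::call_once(init_flag, [&]() {
        instance = new Logger();        // intentionally leaked; the OS reclaims the memory
    });
    return instance;
}

int main() {
    logger_main()->log("hello");        // always go through the accessor
    return 0;                           // process exit does not touch the Logger
}
```

The trade-off, as the comment in the diff notes, is that any buffered output still queued at exit is lost unless it is flushed explicitly.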
common/log.h

Lines changed: 5 additions & 1 deletion
```diff
@@ -49,7 +49,11 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
 struct common_log;
 
 struct common_log * common_log_init();
-struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
+
+// Singleton, intentionally leaked to avoid Windows teardown hangs.
+// Call common_log_flush() before exit if you want to ensure all logs are flushed.
+struct common_log * common_log_main();
+
 void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
 void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
 void common_log_free  (struct common_log * log);
```

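Because the singleton is never destroyed, nothing flushes its queue automatically at exit. A sketch of the shutdown call the new comment recommends; this diff does not show the signature of common_log_flush(), so the pointer-taking form below (mirroring common_log_pause/resume/free) is an assumption:

```cpp
#include "log.h"  // common/log.h from this repository

// Hypothetical shutdown helper: flush pending entries explicitly before exit,
// since the leaked singleton's destructor will never run.
void shutdown_logging() {
    struct common_log * log = common_log_main();
    common_log_flush(log);  // assumed signature, mirroring the other functions in this header
}
```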
common/reasoning-budget.cpp

Lines changed: 0 additions & 28 deletions
@@ -232,34 +232,6 @@ static struct llama_sampler * common_reasoning_budget_init_state(
232232
);
233233
}
234234

235-
struct llama_sampler * common_reasoning_budget_init(
236-
const struct llama_vocab * vocab,
237-
const std::vector<llama_token> & start_tokens,
238-
const std::vector<llama_token> & end_tokens,
239-
const std::vector<llama_token> & forced_tokens,
240-
int32_t budget,
241-
const std::vector<llama_token> & prefill_tokens) {
242-
// Determine initial state from prefill: COUNTING if the prefill begins with
243-
// the start sequence but does not also contain the end sequence after it.
244-
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
245-
if (!prefill_tokens.empty() && !start_tokens.empty() &&
246-
prefill_tokens.size() >= start_tokens.size() &&
247-
std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
248-
initial_state = REASONING_BUDGET_COUNTING;
249-
// If the end sequence also follows the start in the prefill, reasoning
250-
// was opened and immediately closed — stay IDLE.
251-
if (!end_tokens.empty() &&
252-
prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
253-
auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
254-
if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
255-
std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
256-
initial_state = REASONING_BUDGET_IDLE;
257-
}
258-
}
259-
}
260-
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
261-
}
262-
263235
struct llama_sampler * common_reasoning_budget_init(
264236
const struct llama_vocab * vocab,
265237
const std::vector<llama_token> & start_tokens,

common/reasoning-budget.h

Lines changed: 2 additions & 15 deletions
```diff
@@ -29,27 +29,14 @@ enum common_reasoning_budget_state {
 // end_tokens    - token sequence for natural deactivation
 // forced_tokens - token sequence forced when budget expires
 // budget        - max tokens allowed in the reasoning block
-// prefill_tokens - tokens already present in the prompt (generation prompt);
-//                  used to determine the initial state: COUNTING if they begin
-//                  with start_tokens (but don't also end with end_tokens),
-//                  IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
+// initial_state - initial state
 //
 struct llama_sampler * common_reasoning_budget_init(
         const struct llama_vocab * vocab,
         const std::vector<llama_token> & start_tokens,
         const std::vector<llama_token> & end_tokens,
         const std::vector<llama_token> & forced_tokens,
         int32_t budget,
-        const std::vector<llama_token> & prefill_tokens = {});
-
-// Variant that takes an explicit initial state (used by tests and clone).
-// COUNTING with budget <= 0 is promoted to FORCING.
-struct llama_sampler * common_reasoning_budget_init(
-        const struct llama_vocab * vocab,
-        const std::vector<llama_token> & start_tokens,
-        const std::vector<llama_token> & end_tokens,
-        const std::vector<llama_token> & forced_tokens,
-        int32_t budget,
-        common_reasoning_budget_state initial_state);
+        common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE);
 
 common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
```

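The overload deleted from reasoning-budget.cpp above derived the initial state from the prefill tokens by prefix/suffix matching; the header now takes an explicit initial_state instead. For reference, a stand-alone sketch of the removed decision rule, using plain int token IDs in place of llama_token and a local enum in place of the library's state values:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

enum reasoning_state { IDLE, COUNTING };   // stand-ins for REASONING_BUDGET_IDLE / _COUNTING

// Mirrors the removed overload: COUNTING if the prefill begins with the start
// sequence and the end sequence does not already close it, IDLE otherwise.
static reasoning_state detect_initial_state(
        const std::vector<int> & start_tokens,
        const std::vector<int> & end_tokens,
        const std::vector<int> & prefill_tokens) {
    if (prefill_tokens.empty() || start_tokens.empty() ||
        prefill_tokens.size() < start_tokens.size() ||
        !std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
        return IDLE;
    }
    // Reasoning block was opened; check whether the prefill also closes it.
    if (!end_tokens.empty() &&
        prefill_tokens.size() >= start_tokens.size() + end_tokens.size() &&
        std::equal(end_tokens.begin(), end_tokens.end(),
                   prefill_tokens.end() - (std::ptrdiff_t) end_tokens.size())) {
        return IDLE;
    }
    return COUNTING;
}

int main() {
    // <think> = 1, </think> = 2 (hypothetical token IDs)
    const std::vector<int> start = {1}, end = {2};
    detect_initial_state(start, end, {1, 7, 8});     // COUNTING: reasoning is still open
    detect_initial_state(start, end, {1, 7, 8, 2});  // IDLE: opened and already closed
    return 0;
}
```

Callers that still want this behaviour can compute the state themselves and pass it to the simplified initializer.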
common/sampling.cpp

Lines changed: 34 additions & 25 deletions
```diff
@@ -260,32 +260,35 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
         }
     }
 
-    // Feed generation prompt tokens to the grammar sampler so it advances past
-    // tokens the template already placed in the prompt.
-    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+    // Compute prefill tokens from the generation prompt
     std::vector<llama_token> prefill_tokens;
-    if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
+    if (!params.generation_prompt.empty()) {
         GGML_ASSERT(vocab != nullptr);
-        prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
-        if (!prefill_tokens.empty()) {
-            std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
-            if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
-                // Some tokenizers will add a space before the first special token, need to remove
-                prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
+        auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
+        for (size_t i = 0; i < tokens.size(); i++) {
+            std::string piece = common_token_to_piece(vocab, tokens[i], true);
+            if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
+                // Some tokenizers will add a space before the first special token, need to exclude
+                continue;
             }
+            LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
+            prefill_tokens.push_back(tokens[i]);
         }
+    }
 
-        if (grmr && !params.grammar_lazy) {
-            try {
-                for (const auto & token : prefill_tokens) {
-                    llama_sampler_accept(grmr, token);
-                    LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
-                }
-            } catch (std::exception &e) {
-                LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
-                    common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
-                throw e;
+    // Feed generation prompt tokens to the grammar sampler so it advances past
+    // tokens the template already placed in the prompt.
+    // Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
+    if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
+        try {
+            for (const auto & token : prefill_tokens) {
+                llama_sampler_accept(grmr, token);
+                LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
             }
+        } catch (std::exception &e) {
+            LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
+                common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
+            throw e;
         }
     }
 
@@ -296,8 +299,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
             params.reasoning_budget_start,
             params.reasoning_budget_end,
             params.reasoning_budget_forced,
-            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
-            prefill_tokens);
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
+
+        for (const auto & token : prefill_tokens) {
+            llama_sampler_accept(rbudget, token);
+            LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
+        }
     }
 
     if (params.has_logit_bias()) {
@@ -431,17 +438,19 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
     return true;
 }
 
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
     if (!gsmpl) {
         return;
     }
 
     const auto tm = gsmpl->tm();
 
     // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
-    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
+    const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
 
-    llama_sampler_accept(gsmpl->rbudget, token);
+    if (gsmpl->rbudget && is_generated) {
+        llama_sampler_accept(gsmpl->rbudget, token);
+    }
 
     if (gsmpl->grmr && accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
```

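The restructured block above separates two concerns: building prefill_tokens once (dropping a tokenizer-inserted leading space) and then replaying them into whichever samplers need them. A self-contained sketch of just the leading-space rule, using a hypothetical token_piece type instead of the llama.cpp tokenizer API:

```cpp
#include <cctype>
#include <string>
#include <vector>

// Hypothetical tokenizer output: each token's id and its decoded text piece.
struct token_piece { int id; std::string piece; };

// Mirrors the new loop above: collect prefill tokens, skipping a leading token
// whose piece starts with whitespace that the prompt itself does not have
// (some tokenizers insert a space before the first special token).
static std::vector<int> build_prefill(const std::string & prompt,
                                      const std::vector<token_piece> & tokens) {
    std::vector<int> prefill;
    for (size_t i = 0; i < tokens.size(); i++) {
        const std::string & piece = tokens[i].piece;
        if (i == 0 && !piece.empty() && !prompt.empty() &&
            std::isspace((unsigned char) piece[0]) &&
            !std::isspace((unsigned char) prompt[0])) {
            continue; // tokenizer-added space, not part of the template output
        }
        prefill.push_back(tokens[i].id);
    }
    return prefill;
}

int main() {
    // "<think>" hypothetically tokenized as [" ", "<think>"] by a tokenizer
    // that prepends a space before the first special token
    std::vector<token_piece> toks = {{10, " "}, {11, "<think>"}};
    build_prefill("<think>", toks); // yields {11}: the spurious space token is dropped
    return 0;
}
```

The real loop additionally logs each kept token via LOG_DBG; the selection logic is otherwise the same.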
common/sampling.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
-// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
 void common_sampler_reset (struct common_sampler * gsmpl);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
```

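To make the renamed parameter concrete: with is_generated, replayed prompt tokens no longer advance the reasoning-budget or grammar samplers; only genuinely sampled tokens do. A toy model of that gating, with mock sampler types standing in for gsmpl->rbudget and gsmpl->grmr:

```cpp
#include <cstdio>

// Mock samplers standing in for the reasoning-budget and grammar samplers.
struct mock_sampler { int accepted = 0; void accept(int /*token*/) { accepted++; } };

struct mock_common_sampler {
    mock_sampler rbudget;
    mock_sampler grmr;
    bool grammar_should_apply() const { return true; } // e.g. not currently inside a reasoning block
};

// Mirrors the accept logic in the diff above: only generated tokens reach the
// reasoning-budget and grammar samplers; replayed prompt tokens do not.
void sampler_accept(mock_common_sampler & s, int token, bool is_generated) {
    const bool accept_grammar = is_generated && s.grammar_should_apply();
    if (is_generated) {
        s.rbudget.accept(token);
    }
    if (accept_grammar) {
        s.grmr.accept(token);
    }
    // ... the main sampling chain would accept the token here regardless ...
}

int main() {
    mock_common_sampler s;
    sampler_accept(s, 42, /*is_generated=*/true);   // sampled token: advances both samplers
    sampler_accept(s, 7,  /*is_generated=*/false);  // prompt token: advances neither
    std::printf("rbudget=%d grmr=%d\n", s.rbudget.accepted, s.grmr.accepted); // prints 1 1
    return 0;
}
```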
common/speculative.cpp

Lines changed: 6 additions & 6 deletions
```diff
@@ -467,7 +467,7 @@ struct common_speculative_state_draft : public common_speculative_state {
 
         prompt_dft.push_back(id_last);
 
-        LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
+        //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
 
         int ret = llama_decode(ctx_dft, batch);
         if (ret != 0 && ret != 1) {
@@ -495,14 +495,14 @@
 
             common_sampler_accept(smpl, id, true);
 
-            result.push_back(id);
-
-            if (sparams.n_max <= (int) result.size()) {
+            // only collect very high-confidence draft tokens
+            if (cur_p->data[0].p < sparams.p_min) {
                 break;
             }
 
-            // only collect very high-confidence draft tokens
-            if (cur_p->data[0].p < sparams.p_min) {
+            result.push_back(id);
+
+            if (sparams.n_max <= (int) result.size()) {
                 break;
             }
 
```

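The reorder above matters because the old loop pushed the sampled draft token before checking its probability, so one low-confidence token could end up in the draft. A compact stand-alone sketch of the corrected collection order, with hypothetical candidate and parameter types in place of the llama.cpp ones:

```cpp
#include <vector>

struct candidate    { int id; float p; };        // hypothetical: top candidate after sampling one draft step
struct draft_params { float p_min; int n_max; }; // mirrors sparams.p_min / sparams.n_max

// Corrected order, as in the diff above: reject a low-confidence token *before*
// pushing it, then stop once n_max tokens have been collected.
std::vector<int> collect_draft(const std::vector<candidate> & sampled, const draft_params & sp) {
    std::vector<int> result;
    for (const auto & c : sampled) {
        // only collect very high-confidence draft tokens
        if (c.p < sp.p_min) {
            break;
        }
        result.push_back(c.id);
        if (sp.n_max <= (int) result.size()) {
            break;
        }
    }
    return result;
}

int main() {
    // the third token falls below p_min, so only the first two make it into the draft
    std::vector<candidate> sampled = {{5, 0.99f}, {9, 0.97f}, {3, 0.40f}, {8, 0.95f}};
    collect_draft(sampled, {0.90f, 16}); // -> {5, 9}
    return 0;
}
```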
ggml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 10)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
```

ggml/src/ggml-backend-meta.cpp

Lines changed: 18 additions & 1 deletion
```diff
@@ -1826,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
             continue;
         }
 
-        i = get_i_delayed(i);
+        const int i_delayed = get_i_delayed(i);
+
+        // If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
+        // A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
+        // its compute flag disabled and thus gets its data zeroed out.
+        // If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
+        if (i_delayed > i) {
+            for (size_t j = 0; j < n_backends; j++) {
+                auto & bcj = backend_ctx->backend_configs[j];
+                if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                    for (int ii = i + 1; ii <= i_delayed; ii++) {
+                        bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
+                    }
+                }
+            }
+        }
+
+        i = i_delayed;
 
         for (size_t j = 0; j < n_backends; j++) {
             auto & bcj = backend_ctx->backend_configs[j];
```

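The new block above propagates a disabled compute flag forward whenever an AllReduce is delayed, so a backend holding a zero-sized slice does not keep computing on data that was zeroed out. A simplified stand-alone model of that propagation over plain boolean flag arrays, with hypothetical types in place of the ggml ones:

```cpp
#include <vector>

// Simplified stand-in for one backend's node list: true means the node's
// GGML_TENSOR_FLAG_COMPUTE bit is set.
using node_flags = std::vector<bool>;

// Mirrors the new logic above: if the AllReduce for node i is delayed until
// i_delayed, any backend whose node i has compute disabled must also have the
// nodes (i, i_delayed] disabled, otherwise it would compute on zeroed-out data.
void propagate_disabled_compute(std::vector<node_flags> & backends, int i, int i_delayed) {
    if (i_delayed <= i) {
        return; // AllReduce not delayed, nothing to do
    }
    for (auto & nodes : backends) {
        if (!nodes[i]) {
            for (int ii = i + 1; ii <= i_delayed; ii++) {
                nodes[ii] = false;
            }
        }
    }
}

int main() {
    // backend 0 computes node 2, backend 1 has it disabled (e.g. a zero-sized slice)
    std::vector<node_flags> backends = {
        {true, true, true,  true, true},
        {true, true, false, true, true},
    };
    propagate_disabled_compute(backends, /*i=*/2, /*i_delayed=*/4);
    // backend 1 now has nodes 3 and 4 disabled as well; backend 0 is untouched
    return 0;
}
```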
ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
```diff
@@ -485,6 +485,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZIHINTPAUSE)
                 string(APPEND MARCH_STR "_zihintpause")
             endif()
+            if (GGML_CPU_RISCV64_SPACEMIT)
+                # `xsmtvdotii' is only required for GCC >= 15.
+                if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
+                    CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
+                    string(APPEND MARCH_STR "_xsmtvdotii")
+                endif()
+            endif()
 
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         else()
```
