Merge branch 'upstream' into concedo_experimental

LostRuins · LostRuins · commit 376aaf258c93 · 2026-04-04T23:56:02.000+08:00
diff --git a/common/download.cpp b/common/download.cpp
@@ -600,9 +600,12 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
         }
     }
 
-    for (const auto & f : files) {
-        if (gguf_filename_is_model(f.path)) {
-            return f;
+    // fallback to first available model only if tag is empty
+    if (tag.empty()) {
+        for (const auto & f : files) {
+            if (gguf_filename_is_model(f.path)) {
+                return f;
+            }
         }
     }
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -1394,6 +1394,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER,  hparams.n_embd_per_layer);
                 ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA,    hparams.n_embd_head_k_swa);
                 ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,  hparams.n_embd_head_v_swa);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING,     hparams.f_final_logit_softcapping, false);
 
                 switch (hparams.n_layer) {
                     case 35: type = LLM_TYPE_E2B; break;
diff --git a/src/unicode.cpp b/src/unicode.cpp
@@ -753,6 +753,35 @@ static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string &
     return bpe_offsets;
 }
 
+// regex: [^\n]+|[\n]+
+// splits each input chunk into alternating runs of non-newline and newline characters; returns the run lengths, counted in elements of cpts
+static std::vector<size_t> unicode_regex_split_custom_newlines(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
+    bpe_offsets.reserve(offsets.size()); // lower bound only: every non-empty chunk produces at least one run
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    size_t start = 0; // index into cpts where the current chunk begins
+    for (auto offset : offsets) { // each entry is used as a chunk length, not as an absolute position
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        size_t pos = offset_ini;
+        while (pos < offset_end) {
+            const bool is_newline = (cpts[pos] == '\n'); // class of the run that starts at pos
+            const size_t run_start = pos;
+            while (pos < offset_end && (cpts[pos] == '\n') == is_newline) { // extend while the class is unchanged; never crosses the chunk end
+                pos++;
+            }
+            bpe_offsets.push_back(pos - run_start); // emit the length of the finished run
+        }
+    }
+
+    return bpe_offsets;
+}
+
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
     std::vector<size_t> bpe_offsets;
 
@@ -769,6 +798,8 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
     } else if (regex_expr == "\\p{AFMoE_digits}") {
         // AFMOE digit pattern - use custom implementation for proper splitting
         bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
+    } else if (regex_expr == "[^\\n]+|[\\n]+") {
+        bpe_offsets = unicode_regex_split_custom_newlines(text, offsets);
     } else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
         // tiny_aya digit grouping pattern from tokenizer.json:
         //   {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -155,8 +155,8 @@ struct server_slot {
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
 
-    double t_prompt_processing; // ms
-    double t_token_generation;  // ms
+    double t_prompt_processing = 0.0; // ms
+    double t_token_generation = 0.0;  // ms
 
     std::function<void(int /* id_slot */)> callback_on_release;
 
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
@@ -261,14 +261,14 @@ struct result_timings {
     int32_t cache_n = -1;
 
     int32_t prompt_n = -1;
-    double prompt_ms;
-    double prompt_per_token_ms;
-    double prompt_per_second;
+    double prompt_ms = 0.0;
+    double prompt_per_token_ms = 0.0;
+    double prompt_per_second = 0.0;
 
     int32_t predicted_n = -1;
-    double predicted_ms;
-    double predicted_per_token_ms;
-    double predicted_per_second;
+    double predicted_ms = 0.0;
+    double predicted_per_token_ms = 0.0;
+    double predicted_per_second = 0.0;
 
     // Optional speculative metrics - only included when > 0
     int32_t draft_n = 0;

Original file line number	Diff line number	Diff line change
`@@ -600,9 +600,12 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,`
`600`	`600`	`}`
`601`	`601`	`}`
`602`	`602`
`603`		`- for (const auto & f : files) {`
`604`		`- if (gguf_filename_is_model(f.path)) {`
`605`		`- return f;`
	`603`	`+ // fallback to first available model only if tag is empty`
	`604`	`+ if (tag.empty()) {`
	`605`	`+ for (const auto & f : files) {`
	`606`	`+ if (gguf_filename_is_model(f.path)) {`
	`607`	`+ return f;`
	`608`	`+ }`
`606`	`609`	`}`
`607`	`610`	`}`
`608`	`611`