
Commit dad4e2e

TheTom and claude committed
Cherry-pick upstream speculative decoding for hybrid models
Cherry-picks 4 upstream PRs to enable speculative decoding on hybrid MoE+SSM architectures (Qwen3.6-35B-A3B):

- ggml-org#19493 — speculative checkpointing (save/restore recurrent state)
- ggml-org#22114 — refactor "use checkpoint" logic
- ggml-org#22168 — reset i_last on low acceptance streak
- ggml-org#22223 — add --spec-default argument

Smoke tested on M5 Max with turbo4 KV — zero regression.

Co-Authored-By: tturney@psyguard.ai
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4d24ad8 commit dad4e2e

13 files changed

Lines changed: 481 additions & 227 deletions

common/arg.cpp

Lines changed: 12 additions & 1 deletion
@@ -291,7 +291,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
         hf_tag = "default";
     }

-    std::string model_endpoint = get_model_endpoint();
+    std::string model_endpoint = common_get_model_endpoint();
     auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

     // prepare local path for caching
@@ -3890,6 +3890,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

+    add_opt(common_arg(
+        {"--spec-default"},
+        string_format("enable default speculative decoding config"),
+        [](common_params & params) {
+            params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
+            params.speculative.ngram_size_n = 24;
+            params.speculative.n_min = 48;
+            params.speculative.n_max = 64;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+
     return ctx_arg;
 }
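For reference, the new flag is a one-shot preset: it switches the speculative type to ngram-mod and sets the ngram/draft sizes in a single option. A hypothetical invocation (the model path is a placeholder, not part of this diff):

```sh
# Hypothetical usage; any llama.cpp server build that includes this
# commit should accept the flag. The model path is a placeholder.
llama-server -m ./models/model.gguf --spec-default
```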

common/chat.cpp

Lines changed: 1 addition & 1 deletion
@@ -2334,7 +2334,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
         ? input
         : params.generation_prompt + input;

-    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
+    //LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());

     common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
     if (params.debug) {

common/common.cpp

Lines changed: 37 additions & 1 deletion
@@ -1387,7 +1387,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {

 common_init_result::~common_init_result() = default;

-std::string get_model_endpoint() {
+std::string common_get_model_endpoint() {
     const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
     // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
     const char * hf_endpoint_env = getenv("HF_ENDPOINT");
@@ -1402,6 +1402,42 @@ std::string get_model_endpoint() {
     return model_endpoint;
 }

+common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
+    auto * mem = llama_get_memory(ctx);
+    if (mem == nullptr) {
+        return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
+    }
+
+    common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
+
+    llama_memory_clear(mem, true);
+
+    // eval 2 tokens to check if the context is compatible
+    std::vector<llama_token> tmp;
+    tmp.push_back(0);
+    tmp.push_back(0);
+
+    int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
+    if (ret != 0) {
+        LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
+        res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
+        goto done;
+    }
+
+    // try to remove the last tokens
+    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
+        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
+        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
+        goto done;
+    }
+
+done:
+    llama_memory_clear(mem, true);
+    llama_synchronize(ctx);
+
+    return res;
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     std::vector<llama_adapter_lora *> loras;
     std::vector<float> scales;
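Since common_context_can_seq_rm probes by decoding two dummy tokens and then attempting a partial removal (and clears the memory on both entry and exit), a caller is expected to run it once at startup, before any real evaluation. A minimal sketch of such a caller, assuming a context created elsewhere; the helper name and branch comments are illustrative, not from this diff:

```cpp
#include "common.h"
#include "llama.h"

// Hypothetical helper (not part of this commit): pick a rollback strategy
// for speculative decoding based on what the context's memory supports.
// Must run before real tokens are evaluated, since the probe clears the
// context's memory.
static bool spec_can_rollback_partially(llama_context * ctx) {
    switch (common_context_can_seq_rm(ctx)) {
        case COMMON_CONTEXT_SEQ_RM_TYPE_PART:
            return true;   // cache can drop just the rejected draft tail
        case COMMON_CONTEXT_SEQ_RM_TYPE_FULL:  // whole sequences only
        case COMMON_CONTEXT_SEQ_RM_TYPE_NO:    // no memory module
        default:
            return false;  // fall back to checkpoint save/restore
    }
}
```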

common/common.h

Lines changed: 21 additions & 6 deletions
@@ -10,7 +10,6 @@
 #include <sstream>
 #include <string>
 #include <string_view>
-#include <variant>
 #include <vector>
 #include <map>

@@ -315,15 +314,15 @@ struct common_params_speculative {
     // general-purpose speculative decoding parameters

     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min = 0;  // minimum number of draft tokens to use for speculative decoding
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)

     // ngram-based speculative decoding

-    uint16_t ngram_size_n = 12; // ngram size for lookup
-    uint16_t ngram_size_m = 48; // mgram size for speculative tokens
-    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
+    uint16_t ngram_size_n = 12;  // ngram size for lookup
+    uint16_t ngram_size_m = 48;  // mgram size for speculative tokens
+    uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed

     std::shared_ptr<common_ngram_mod> ngram_mod;

@@ -859,7 +858,23 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

-std::string get_model_endpoint();
+// model endpoint from env
+std::string common_get_model_endpoint();
+
+//
+// Context utils
+//
+
+enum common_context_seq_rm_type {
+    COMMON_CONTEXT_SEQ_RM_TYPE_NO   = 0, // seq_rm not supported (e.g. no memory module)
+    COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
+    COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
+};
+
+// check if the llama_context can remove sequences
+// note: clears the memory of the context
+common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
+

 //
 // Batch utils

common/hf-cache.cpp

Lines changed: 2 additions & 2 deletions
@@ -229,7 +229,7 @@ static nl::json api_get(const std::string & url,
 static std::string get_repo_commit(const std::string & repo_id,
                                    const std::string & token) {
     try {
-        auto endpoint = get_model_endpoint();
+        auto endpoint = common_get_model_endpoint();
         auto json = api_get(endpoint + "api/models/" + repo_id + "/refs", token);

         if (!json.is_object() ||
@@ -307,7 +307,7 @@ hf_files get_repo_files(const std::string & repo_id,
     hf_files files;

     try {
-        auto endpoint = get_model_endpoint();
+        auto endpoint = common_get_model_endpoint();
         auto json = api_get(endpoint + "api/models/" + repo_id + "/tree/" + commit + "?recursive=true", token);

         if (!json.is_array()) {

common/ngram-map.cpp

Lines changed: 4 additions & 4 deletions
@@ -208,7 +208,7 @@ void common_ngram_map_begin(
             count_keys, count_keys_del, count_values_del, count_map_entries_upd);
     }

-    map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
+    map.idx_last_check = size_begin;
     map.size_last_begin = size_begin;
 }

@@ -231,7 +231,7 @@ void common_ngram_map_draft(common_ngram_map & map,
         GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
     }

-    if (map.idx_last_check > cur_len) {
+    if (map.idx_last_check > cur_len) {
         // Should not happen because of common_ngram_map_begin().
         GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len);
     }
@@ -386,7 +386,7 @@ void common_ngram_map_draft(common_ngram_map & map,
     LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
         curr_key.key_idx, key_offset, curr_key.key_num, draft.size());

-    map.last_draft_created = false;
+    map.last_draft_created = true;
     map.last_draft_key_idx = key_offset;
     map.last_draft_value_idx = 0; // value 0 is used for simple mode
     return;
@@ -524,7 +524,7 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
     struct common_ngram_map_value & curr_value = curr_key.values[val_idx]; // value used for draft generation.

     // update the value statistics
-    LOG_INF("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
+    LOG_DBG("common_ngram_map_send_accepted: n_accepted = %d, prev value_num = %d\n",
         n_accepted, curr_value.n_accepted);
     curr_value.n_accepted = n_accepted;
 }
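The last two hunks appear to work as a pair: common_ngram_map_accept updates statistics for the draft that common_ngram_map_draft produced, so the draft path records last_draft_created = true, presumably so the accept step can tell that a draft exists. A comment-only sketch of the assumed lifecycle, reconstructed from the function names in this diff (only common_ngram_map_accept's signature appears in full here; the rest is not the verbatim upstream API):

```cpp
// Assumed per-step lifecycle of the ngram map (reconstructed; argument
// lists abbreviated):
//
//   common_ngram_map_begin(...)  - resync with the current context;
//                                  idx_last_check now restarts at size_begin
//   common_ngram_map_draft(...)  - propose draft tokens; on success it sets
//                                  last_draft_created = true so the accept
//                                  step knows a draft was produced
//   common_ngram_map_accept(map, n_accepted)
//                                - feed back how many drafted tokens the
//                                  target model accepted (now logged at
//                                  debug level instead of info)
```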
