ggml-org
diff --git a/‎common/arg.cpp‎
Lines changed: 16 additions & 2 deletions b/‎common/arg.cpp‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎common/common.cpp‎
Lines changed: 0 additions & 23 deletions b/‎common/common.cpp‎
Lines changed: 0 additions & 23 deletions
diff --git a/‎common/common.h‎
Lines changed: 4 additions & 4 deletions b/‎common/common.h‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎common/ngram-map.cpp‎
Lines changed: 1 addition & 1 deletion b/‎common/ngram-map.cpp‎
Lines changed: 1 addition & 1 deletion
@@ -536,7 +536,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
                 throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
             }
             if (!seen_args.insert(arg).second) {
-                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+                const bool skip = (arg == "--spec-type");
+
+                if (!skip) {
+                    LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+                }
             }
             auto & tmp = arg_to_options[arg];
             auto opt = *tmp.first;
@@ -893,7 +897,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
         if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+            const bool skip = (arg == "--spec-type");
+
+            if (!skip) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+            }
         }
         auto opt = *arg_to_options[arg];
         std::string val;
@@ -4117,6 +4125,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.ngram_mod.n_match = 24;
             params.speculative.ngram_mod.n_min = 48;
             params.speculative.ngram_mod.n_max = 64;
+
+            // TODO: not sure if this is a good config - explore more settings and potentially enable it
+            //params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
+            //params.speculative.ngram_map_k4v.size_n = 8;
+            //params.speculative.ngram_map_k4v.size_m = 24;
+            //params.speculative.ngram_map_k4v.min_hits = 2;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
 
@@ -1256,29 +1256,6 @@ common_init_result::common_init_result(common_params & params, bool model_only)
         cparams.n_samplers = pimpl->samplers_seq_config.size();
     }
 
-    // [TAG_RS_STATE_ROLLBACK_SUPPORT]
-    // TODO: ngram speculative methods require checkpointing in addition to partial RS rollback
-    //       currently this is not supported. so we disable the partial rollback
-    if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) {
-        auto & types = params.speculative.types;
-
-        for (int i = 0; i < (int) types.size(); i++) {
-            if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) {
-                continue;
-            }
-            if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) {
-                continue;
-            }
-
-            cparams.n_rs_seq = 0;
-
-            LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__,
-                    common_speculative_type_to_str(types[i]).c_str());
-
-            break;
-        }
-    }
-
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
 
@@ -299,11 +299,11 @@ struct common_params_model {
 
 // draft-model-based speculative decoding parameters
 struct common_params_speculative_draft {
-    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0;  // minimum number of draft tokens to use for speculative decoding
+    int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
 
-    float p_split = 0.1f;  // speculative decoding split probability
-    float p_min   = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
 
     common_params_model mparams;
 
 
@@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map,
         draft.push_back(inp[match_pos + n + i]);
     }
 
-    LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
+    LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
             key_offset, slot_max,
             curr_key.key_num, draft.size());
Original file line number	Diff line number	Diff line change
`@@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map,`
`500`	`500`	`draft.push_back(inp[match_pos + n + i]);`
`501`	`501`	`}`
`502`	`502`
`503`		`- LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,`
	`503`	`+ LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,`
`504`	`504`	`key_offset, slot_max,`
`505`	`505`	`curr_key.key_num, draft.size());`
`506`	`506`