Added mechanics for a full clear if fast forward is not used; this should help recover from bad states.

LostRuins · LostRuins · commit b867b67e7e47 · 2025-12-05T16:43:37.000+08:00
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -3782,7 +3782,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         {
             if(kcpp_data->use_fastforward)
             {
-                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true);
+                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);
             }
         }
         if(is_recurrent)
@@ -3830,12 +3830,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             if(kcpp_data->use_fastforward)
             {
-                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false);
+                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);
             }
         }
         if(file_format == FileFormat::GGUF_GENERIC)
         {
-            llama_memory_seq_rm(llama_get_memory(llama_ctx_v4), 0, n_past, -1);
+            if(n_past==0) //force full clear
+            {
+                llama_memory_clear(llama_get_memory(llama_ctx_v4),true);
+            }
+            else
+            {
+                llama_memory_seq_rm(llama_get_memory(llama_ctx_v4), 0, n_past, -1);
+            }
             if(draft_ctx)
             {
                 llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, n_past, -1);
diff --git a/model_adapter.cpp b/model_adapter.cpp
@@ -513,7 +513,7 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
 
  void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
  int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
- bool useSmartContext, const bool requireFullSubset)
+ bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)
  {
      const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
      const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
@@ -568,6 +568,13 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
         }
     }
 
+    if(n_past < minimum_to_proceed) //too few tokens to fast forward, so lets start fresh
+    {
+        last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());
+        n_past = 0;
+        fastforwardok = false;
+    }
+
     if(fastforwardok)
     {
         last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
diff --git a/model_adapter.h b/model_adapter.h
@@ -129,7 +129,7 @@ int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> se
 FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta);
 void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
  int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
- const bool useSmartContext, const bool requireFullSubset);
+ const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed);
 bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
 std::string gguf_get_model_arch(const std::string & filename);
 

Original file line number	Diff line number	Diff line change
`@@ -3782,7 +3782,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3782`	`3782`	`{`
`3783`	`3783`	`if(kcpp_data->use_fastforward)`
`3784`	`3784`	`{`
`3785`		`- ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true);`
	`3785`	`+ ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);`
`3786`	`3786`	`}`
`3787`	`3787`	`}`
`3788`	`3788`	`if(is_recurrent)`
`@@ -3830,12 +3830,19 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3830`	`3830`	`}`
`3831`	`3831`	`if(kcpp_data->use_fastforward)`
`3832`	`3832`	`{`
`3833`		`- ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false);`
	`3833`	`+ ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);`
`3834`	`3834`	`}`
`3835`	`3835`	`}`
`3836`	`3836`	`if(file_format == FileFormat::GGUF_GENERIC)`
`3837`	`3837`	`{`
`3838`		`- llama_memory_seq_rm(llama_get_memory(llama_ctx_v4), 0, n_past, -1);`
	`3838`	`+ if(n_past==0) //force full clear`
	`3839`	`+ {`
	`3840`	`+ llama_memory_clear(llama_get_memory(llama_ctx_v4),true);`
	`3841`	`+ }`
	`3842`	`+ else`
	`3843`	`+ {`
	`3844`	`+ llama_memory_seq_rm(llama_get_memory(llama_ctx_v4), 0, n_past, -1);`
	`3845`	`+ }`
`3839`	`3846`	`if(draft_ctx)`
`3840`	`3847`	`{`
`3841`	`3848`	`llama_memory_seq_rm(llama_get_memory(draft_ctx), 0, n_past, -1);`
Original file line number	Diff line number	Diff line change
`@@ -513,7 +513,7 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)`
`513`	`513`
`514`	`514`	`void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,`
`515`	`515`	`int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,`
`516`		`- bool useSmartContext, const bool requireFullSubset)`
	`516`	`+ bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)`
`517`	`517`	`{`
`518`	`518`	`const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext`
`519`	`519`	`const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext`
`@@ -568,6 +568,13 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)`
`568`	`568`	`}`
`569`	`569`	`}`
`570`	`570`
	`571`	`+ if(n_past < minimum_to_proceed) //too few tokens to fast forward, so lets start fresh`
	`572`	`+ {`
	`573`	`+ last_n_tokens.erase(last_n_tokens.end() - n_past, last_n_tokens.end());`
	`574`	`+ n_past = 0;`
	`575`	`+ fastforwardok = false;`
	`576`	`+ }`
	`577`	`+`
`571`	`578`	`if(fastforwardok)`
`572`	`579`	`{`
`573`	`580`	`last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);`