@@ -194,15 +194,18 @@ static void TokenizeString(const std::string & str_to_tokenize, std::vector<int>
194194 if (add_bos)
195195 {
196196 llama_token bostoadd = llama_token_bos (&(llama_ctx_v4->model ));
197- if (output_tokens. size ()== 0 )
197+ if (bostoadd != LLAMA_TOKEN_NULL) // if bos does not exist, do not add it
198198 {
199- output_tokens.push_back (bostoadd);
200- }
201- else
202- {
203- if (output_tokens[0 ]!=bostoadd)
199+ if (output_tokens.size ()==0 )
204200 {
205- output_tokens.insert (output_tokens.begin (), 1 , bostoadd);
201+ output_tokens.push_back (bostoadd);
202+ }
203+ else
204+ {
205+ if (output_tokens[0 ]!=bostoadd)
206+ {
207+ output_tokens.insert (output_tokens.begin (), 1 , bostoadd);
208+ }
206209 }
207210 }
208211 }
@@ -1870,6 +1873,12 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
18701873 }
18711874 }
18721875
1876+ if (file_format_meta.model_architecture ==GGUFArch::ARCH_RWKV)
1877+ {
1878+ printf (" \n RWKV6 Overriding EOS and BOS IDs to 0\n " );
1879+ llamamodel->vocab .special_bos_id = llamamodel->vocab .special_eos_id = 0 ;
1880+ }
1881+
18731882 llama_ctx_params.flash_attn = kcpp_params->flash_attn ;
18741883 llama_ctx_params.type_k = (inputs.quant_k >1 ?GGML_TYPE_Q4_0:(inputs.quant_k ==1 ?GGML_TYPE_Q8_0:GGML_TYPE_F16));
18751884 llama_ctx_params.type_v = (inputs.quant_v >1 ?GGML_TYPE_Q4_0:(inputs.quant_v ==1 ?GGML_TYPE_Q8_0:GGML_TYPE_F16));
@@ -3085,7 +3094,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
30853094 if (!inputs.allow_eos_token && !inputs.bypass_eos_token )
30863095 {
30873096 // set the logit of the eos token to very low to avoid sampling it
3088- logitsPtr[eosID] = lowestLogit;
3097+ if (eosID!=LLAMA_TOKEN_NULL)
3098+ {
3099+ logitsPtr[eosID] = lowestLogit;
3100+ }
30893101 if (eotID!=-1 )
30903102 {
30913103 logitsPtr[eotID] = lowestLogit;
0 commit comments