support outro but don't actually use it yet

LostRuins · LostRuins · commit 919a010ebcac · 2026-04-10T23:18:30.000+08:00
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -3332,13 +3332,14 @@ int GetThreadsToUse(bool blasmode)
 }
 
 //this function prepares the clip embds for llava. it's only needed when images change
-static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro)
+static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro, const std::vector<int> & media_outro)
 {
     bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr);
     bool audio_on = (clp_ctx_a != nullptr);
     if (vision_on || audio_on)
     {
         int introsize = media_intro.size();
+        int outrosize = media_outro.size();
         last_media_mem.clear();
 
         for(int i=0;i<media_objects.size();++i)
@@ -3373,7 +3374,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
                         int tokcnt = (chunk.clp_image_tokens + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
                         if(i==0)
                         {
-                            tokcnt += introsize;
+                            tokcnt += introsize + outrosize;
                         }
                         for(int n=0;n<tokcnt;++n)
                         {
@@ -3425,7 +3426,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
                     int tokcnt = (cliptokensneeded + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
                     if(i==0)
                     {
-                        tokcnt += introsize;
+                        tokcnt += introsize + outrosize;
                     }
                     for(int n=0;n<tokcnt;++n)
                     {
@@ -3627,6 +3628,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::string addedmemory = inputs.memory;
     std::string negative_prompt = inputs.negative_prompt;
 
+    std::vector<int> media_intro; //added before media list
+    std::vector<int> media_outro; //added after media list
+    TokenizeString("\nAttached Media:\n", media_intro, file_format, true);
+
     //clear previous run llava embd memory, just-in-time free
     for(int i=0;i<media_objects.size();++i)
     {
@@ -3918,22 +3923,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     // tokenize the prompt
     std::vector<int> embd_inp;
     std::vector<int> embd_inp_mem; //for storing added memory
-    std::vector<int> media_intro; //added before media list
     std::vector<int> guidance_embd; //holds the guidance prompt
     bool media_embds_built = false;
 
     int32_t nctx = kcpp_data->n_ctx;
 
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
-    TokenizeString("\nAttached Media:\n", media_intro, file_format, true);
 
     if(media_composite_image_signature=="")
     {
         last_media_mem.clear();
     }
     if(media_data_changed)
     {
-        PrepareMediaEmbds(nctx, media_intro);
+        PrepareMediaEmbds(nctx, media_intro, media_outro);
         media_embds_built = true;
     }
 
@@ -5057,7 +5060,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 {
                     if(!media_embds_built) //this should never happen! however, handle it anyway
                     {
-                        PrepareMediaEmbds(nctx, media_intro);
+                        PrepareMediaEmbds(nctx, media_intro, media_outro);
                         media_embds_built = true;
                         printf("\nSomehow media embeds was not prepared (maybe no fast forward), rebuilding it...\n");
                     }
@@ -5073,6 +5076,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                         int llavatokenscounted = 0;
                         int llavatokensevaled = 0;
                         int introsize = media_intro.size();
+                        int outrosize = media_outro.size();
                         while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))
                         {
                             if (!last_n_tokens.empty())
@@ -5162,6 +5166,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                                 llavatokensevaled += end_size;
                             }
                         }
+                        if(media_objects.size()>0 && outrosize>0)
+                        {
+                            //added after all media but before prompt
+                            kcpp_embd_batch batch = kcpp_embd_batch(media_outro, n_past, use_mrope, false);
+                            auto evr = llama_decode(llama_ctx_v4, batch.batch);
+                            if(evr!=0)
+                            {
+                                printf("\nError when appending media outro: %d\n",evr);
+                            }
+                            else
+                            {
+                                printf("\rProcessing Media Outro (%d tokens)",outrosize);
+                            }
+                            n_past += outrosize;
+                            llavatokensevaled += outrosize;
+                        }
                         if(llavatokenscounted!=llavatokensevaled)
                         {
                             media_composite_image_signature = ""; //force invalidate

Original file line number	Diff line number	Diff line change
`@@ -3332,13 +3332,14 @@ int GetThreadsToUse(bool blasmode)`
`3332`	`3332`	`}`
`3333`	`3333`
`3334`	`3334`	`//this function prepares the clip embds for llava. it's only needed when images change`
`3335`		`-static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro)`
	`3335`	`+static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro, const std::vector<int> & media_outro)`
`3336`	`3336`	`{`
`3337`	`3337`	`bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr);`
`3338`	`3338`	`bool audio_on = (clp_ctx_a != nullptr);`
`3339`	`3339`	`if (vision_on \|\| audio_on)`
`3340`	`3340`	`{`
`3341`	`3341`	`int introsize = media_intro.size();`
	`3342`	`+ int outrosize = media_outro.size();`
`3342`	`3343`	`last_media_mem.clear();`
`3343`	`3344`
`3344`	`3345`	`for(int i=0;i<media_objects.size();++i)`
`@@ -3373,7 +3374,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int`
`3373`	`3374`	`int tokcnt = (chunk.clp_image_tokens + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());`
`3374`	`3375`	`if(i==0)`
`3375`	`3376`	`{`
`3376`		`- tokcnt += introsize;`
	`3377`	`+ tokcnt += introsize + outrosize;`
`3377`	`3378`	`}`
`3378`	`3379`	`for(int n=0;n<tokcnt;++n)`
`3379`	`3380`	`{`
`@@ -3425,7 +3426,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int`
`3425`	`3426`	`int tokcnt = (cliptokensneeded + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());`
`3426`	`3427`	`if(i==0)`
`3427`	`3428`	`{`
`3428`		`- tokcnt += introsize;`
	`3429`	`+ tokcnt += introsize + outrosize;`
`3429`	`3430`	`}`
`3430`	`3431`	`for(int n=0;n<tokcnt;++n)`
`3431`	`3432`	`{`
`@@ -3627,6 +3628,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3627`	`3628`	`std::string addedmemory = inputs.memory;`
`3628`	`3629`	`std::string negative_prompt = inputs.negative_prompt;`
`3629`	`3630`
	`3631`	`+ std::vector<int> media_intro; //added before media list`
	`3632`	`+ std::vector<int> media_outro; //added after media list`
	`3633`	`+ TokenizeString("\nAttached Media:\n", media_intro, file_format, true);`
	`3634`	`+`
`3630`	`3635`	`//clear previous run llava embd memory, just-in-time free`
`3631`	`3636`	`for(int i=0;i<media_objects.size();++i)`
`3632`	`3637`	`{`
`@@ -3918,22 +3923,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`3918`	`3923`	`// tokenize the prompt`
`3919`	`3924`	`std::vector<int> embd_inp;`
`3920`	`3925`	`std::vector<int> embd_inp_mem; //for storing added memory`
`3921`		`- std::vector<int> media_intro; //added before media list`
`3922`	`3926`	`std::vector<int> guidance_embd; //holds the guidance prompt`
`3923`	`3927`	`bool media_embds_built = false;`
`3924`	`3928`
`3925`	`3929`	`int32_t nctx = kcpp_data->n_ctx;`
`3926`	`3930`
`3927`	`3931`	`TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);`
`3928`		`- TokenizeString("\nAttached Media:\n", media_intro, file_format, true);`
`3929`	`3932`
`3930`	`3933`	`if(media_composite_image_signature=="")`
`3931`	`3934`	`{`
`3932`	`3935`	`last_media_mem.clear();`
`3933`	`3936`	`}`
`3934`	`3937`	`if(media_data_changed)`
`3935`	`3938`	`{`
`3936`		`- PrepareMediaEmbds(nctx, media_intro);`
	`3939`	`+ PrepareMediaEmbds(nctx, media_intro, media_outro);`
`3937`	`3940`	`media_embds_built = true;`
`3938`	`3941`	`}`
`3939`	`3942`
`@@ -5057,7 +5060,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`5057`	`5060`	`{`
`5058`	`5061`	`if(!media_embds_built) //this should never happen! however, handle it anyway`
`5059`	`5062`	`{`
`5060`		`- PrepareMediaEmbds(nctx, media_intro);`
	`5063`	`+ PrepareMediaEmbds(nctx, media_intro, media_outro);`
`5061`	`5064`	`media_embds_built = true;`
`5062`	`5065`	`printf("\nSomehow media embeds was not prepared (maybe no fast forward), rebuilding it...\n");`
`5063`	`5066`	`}`
`@@ -5073,6 +5076,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`5073`	`5076`	`int llavatokenscounted = 0;`
`5074`	`5077`	`int llavatokensevaled = 0;`
`5075`	`5078`	`int introsize = media_intro.size();`
	`5079`	`+ int outrosize = media_outro.size();`
`5076`	`5080`	`while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A \|\| embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))`
`5077`	`5081`	`{`
`5078`	`5082`	`if (!last_n_tokens.empty())`
`@@ -5162,6 +5166,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`5162`	`5166`	`llavatokensevaled += end_size;`
`5163`	`5167`	`}`
`5164`	`5168`	`}`
	`5169`	`+ if(media_objects.size()>0 && outrosize>0)`
	`5170`	`+ {`
	`5171`	`+ //added after all media but before prompt`
	`5172`	`+ kcpp_embd_batch batch = kcpp_embd_batch(media_outro, n_past, use_mrope, false);`
	`5173`	`+ auto evr = llama_decode(llama_ctx_v4, batch.batch);`
	`5174`	`+ if(evr!=0)`
	`5175`	`+ {`
	`5176`	`+ printf("\nError when appending media outro: %d\n",evr);`
	`5177`	`+ }`
	`5178`	`+ else`
	`5179`	`+ {`
	`5180`	`+ printf("\rProcessing Media Outro (%d tokens)",outrosize);`
	`5181`	`+ }`
	`5182`	`+ n_past += outrosize;`
	`5183`	`+ llavatokensevaled += outrosize;`
	`5184`	`+ }`
`5165`	`5185`	`if(llavatokenscounted!=llavatokensevaled)`
`5166`	`5186`	`{`
`5167`	`5187`	`media_composite_image_signature = ""; //force invalidate`