@@ -3332,13 +3332,14 @@ int GetThreadsToUse(bool blasmode)
33323332}
33333333
33343334// this function prepares the clip embds for llava. it's only needed when images change
3335- static void PrepareMediaEmbds (const int nctx, const std::vector<int > & media_intro)
3335+ static void PrepareMediaEmbds (const int nctx, const std::vector<int > & media_intro, const std::vector< int > & media_outro )
33363336{
33373337 bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr );
33383338 bool audio_on = (clp_ctx_a != nullptr );
33393339 if (vision_on || audio_on)
33403340 {
33413341 int introsize = media_intro.size ();
3342+ int outrosize = media_outro.size ();
33423343 last_media_mem.clear ();
33433344
33443345 for (int i=0 ;i<media_objects.size ();++i)
@@ -3373,7 +3374,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
33733374 int tokcnt = (chunk.clp_image_tokens + media_objects[i].chunk_start_seq .size () + media_objects[i].chunk_end_seq .size ());
33743375 if (i==0 )
33753376 {
3376- tokcnt += introsize;
3377+ tokcnt += introsize + outrosize ;
33773378 }
33783379 for (int n=0 ;n<tokcnt;++n)
33793380 {
@@ -3425,7 +3426,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
34253426 int tokcnt = (cliptokensneeded + media_objects[i].chunk_start_seq .size () + media_objects[i].chunk_end_seq .size ());
34263427 if (i==0 )
34273428 {
3428- tokcnt += introsize;
3429+ tokcnt += introsize + outrosize ;
34293430 }
34303431 for (int n=0 ;n<tokcnt;++n)
34313432 {
@@ -3627,6 +3628,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
36273628 std::string addedmemory = inputs.memory ;
36283629 std::string negative_prompt = inputs.negative_prompt ;
36293630
3631+ std::vector<int > media_intro; // added before media list
3632+ std::vector<int > media_outro; // added after media list, before the prompt
3633+ TokenizeString (" \n Attached Media:\n " , media_intro, file_format, true );
3634+
36303635 // clear previous run llava embd memory, just-in-time free
36313636 for (int i=0 ;i<media_objects.size ();++i)
36323637 {
@@ -3918,22 +3923,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
39183923 // tokenize the prompt
39193924 std::vector<int > embd_inp;
39203925 std::vector<int > embd_inp_mem; // for storing added memory
3921- std::vector<int > media_intro; // added before media list
39223926 std::vector<int > guidance_embd; // holds the guidance prompt
39233927 bool media_embds_built = false ;
39243928
39253929 int32_t nctx = kcpp_data->n_ctx ;
39263930
39273931 TokenizeString (kcpp_data->prompt , embd_inp, file_format, add_bos_token);
3928- TokenizeString (" \n Attached Media:\n " , media_intro, file_format, true );
39293932
39303933 if (media_composite_image_signature==" " )
39313934 {
39323935 last_media_mem.clear ();
39333936 }
39343937 if (media_data_changed)
39353938 {
3936- PrepareMediaEmbds (nctx, media_intro);
3939+ PrepareMediaEmbds (nctx, media_intro, media_outro );
39373940 media_embds_built = true ;
39383941 }
39393942
@@ -5057,7 +5060,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
50575060 {
50585061 if (!media_embds_built) // this should never happen! however, handle it anyway
50595062 {
5060- PrepareMediaEmbds (nctx, media_intro);
5063+ PrepareMediaEmbds (nctx, media_intro, media_outro );
50615064 media_embds_built = true ;
50625065 printf (" \n Somehow media embeds was not prepared (maybe no fast forward), rebuilding it...\n " );
50635066 }
@@ -5073,6 +5076,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
50735076 int llavatokenscounted = 0 ;
50745077 int llavatokensevaled = 0 ;
50755078 int introsize = media_intro.size ();
5079+ int outrosize = media_outro.size ();
50765080 while (input_consumed < embd_inp.size () && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))
50775081 {
50785082 if (!last_n_tokens.empty ())
@@ -5162,6 +5166,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
51625166 llavatokensevaled += end_size;
51635167 }
51645168 }
5169+ if (media_objects.size ()>0 && outrosize>0 )
5170+ {
5171+ // added after all media but before prompt
5172+ kcpp_embd_batch batch = kcpp_embd_batch (media_outro, n_past, use_mrope, false );
5173+ auto evr = llama_decode (llama_ctx_v4, batch.batch );
5174+ if (evr!=0 )
5175+ {
5176+ printf (" \n Error when appending media outro: %d\n " ,evr);
5177+ }
5178+ else
5179+ {
5180+ printf (" \r Processing Media Outro (%d tokens)" ,outrosize);
5181+ }
5182+ n_past += outrosize;
5183+ llavatokensevaled += outrosize;
5184+ }
51655185 if (llavatokenscounted!=llavatokensevaled)
51665186 {
51675187 media_composite_image_signature = " " ; // force invalidate
0 commit comments