Skip to content

Commit 919a010

Browse files
committed
support outro but don't actually use it yet
1 parent 9b95ade commit 919a010

1 file changed

Lines changed: 27 additions & 7 deletions

File tree

gpttype_adapter.cpp

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3332,13 +3332,14 @@ int GetThreadsToUse(bool blasmode)
33323332
}
33333333

33343334
//this function prepares the clip embds for llava. it's only needed when images change
3335-
static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro)
3335+
static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro, const std::vector<int> & media_outro)
33363336
{
33373337
bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr);
33383338
bool audio_on = (clp_ctx_a != nullptr);
33393339
if (vision_on || audio_on)
33403340
{
33413341
int introsize = media_intro.size();
3342+
int outrosize = media_outro.size();
33423343
last_media_mem.clear();
33433344

33443345
for(int i=0;i<media_objects.size();++i)
@@ -3373,7 +3374,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
33733374
int tokcnt = (chunk.clp_image_tokens + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
33743375
if(i==0)
33753376
{
3376-
tokcnt += introsize;
3377+
tokcnt += introsize + outrosize;
33773378
}
33783379
for(int n=0;n<tokcnt;++n)
33793380
{
@@ -3425,7 +3426,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_int
34253426
int tokcnt = (cliptokensneeded + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
34263427
if(i==0)
34273428
{
3428-
tokcnt += introsize;
3429+
tokcnt += introsize + outrosize;
34293430
}
34303431
for(int n=0;n<tokcnt;++n)
34313432
{
@@ -3627,6 +3628,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
36273628
std::string addedmemory = inputs.memory;
36283629
std::string negative_prompt = inputs.negative_prompt;
36293630

3631+
std::vector<int> media_intro; //added before media list
3632+
std::vector<int> media_outro; //added after media list, before the prompt
3633+
TokenizeString("\nAttached Media:\n", media_intro, file_format, true);
3634+
36303635
//clear previous run llava embd memory, just-in-time free
36313636
for(int i=0;i<media_objects.size();++i)
36323637
{
@@ -3918,22 +3923,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
39183923
// tokenize the prompt
39193924
std::vector<int> embd_inp;
39203925
std::vector<int> embd_inp_mem; //for storing added memory
3921-
std::vector<int> media_intro; //added before media list
39223926
std::vector<int> guidance_embd; //holds the guidance prompt
39233927
bool media_embds_built = false;
39243928

39253929
int32_t nctx = kcpp_data->n_ctx;
39263930

39273931
TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
3928-
TokenizeString("\nAttached Media:\n", media_intro, file_format, true);
39293932

39303933
if(media_composite_image_signature=="")
39313934
{
39323935
last_media_mem.clear();
39333936
}
39343937
if(media_data_changed)
39353938
{
3936-
PrepareMediaEmbds(nctx, media_intro);
3939+
PrepareMediaEmbds(nctx, media_intro, media_outro);
39373940
media_embds_built = true;
39383941
}
39393942

@@ -5057,7 +5060,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
50575060
{
50585061
if(!media_embds_built) //this should never happen! however, handle it anyway
50595062
{
5060-
PrepareMediaEmbds(nctx, media_intro);
5063+
PrepareMediaEmbds(nctx, media_intro, media_outro);
50615064
media_embds_built = true;
50625065
printf("\nSomehow media embeds was not prepared (maybe no fast forward), rebuilding it...\n");
50635066
}
@@ -5073,6 +5076,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
50735076
int llavatokenscounted = 0;
50745077
int llavatokensevaled = 0;
50755078
int introsize = media_intro.size();
5079+
int outrosize = media_outro.size();
50765080
while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))
50775081
{
50785082
if (!last_n_tokens.empty())
@@ -5162,6 +5166,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
51625166
llavatokensevaled += end_size;
51635167
}
51645168
}
5169+
if(media_objects.size()>0 && outrosize>0)
5170+
{
5171+
//added after all media but before prompt
5172+
kcpp_embd_batch batch = kcpp_embd_batch(media_outro, n_past, use_mrope, false);
5173+
auto evr = llama_decode(llama_ctx_v4, batch.batch);
5174+
if(evr!=0)
5175+
{
5176+
printf("\nError when appending media outro: %d\n",evr);
5177+
}
5178+
else
5179+
{
5180+
printf("\rProcessing Media Outro (%d tokens)",outrosize);
5181+
}
5182+
n_past += outrosize;
5183+
llavatokensevaled += outrosize;
5184+
}
51655185
if(llavatokenscounted!=llavatokensevaled)
51665186
{
51675187
media_composite_image_signature = ""; //force invalidate

0 commit comments

Comments
 (0)