Skip to content

Commit f0564f9

Browse files
committed
updated lite, added better separators for multimodal chunks (universal)
1 parent 2a59adc commit f0564f9

File tree

3 files changed

+163
-92
lines changed

3 files changed

+163
-92
lines changed

gpttype_adapter.cpp

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3009,13 +3009,12 @@ int GetThreadsToUse(bool blasmode)
30093009
}
30103010

30113011
//this function prepares the clip embds for llava. it's only needed when the attached media (images or audio) changes
3012-
static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep, const std::vector<int> & media_intro)
3012+
static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro)
30133013
{
30143014
bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr);
30153015
bool audio_on = (clp_ctx_a != nullptr);
30163016
if (vision_on || audio_on)
30173017
{
3018-
int sepsize = media_sep.size();
30193018
int introsize = media_intro.size();
30203019
last_media_mem.clear();
30213020

@@ -3048,7 +3047,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
30483047
int cliptokensneeded = chunk.clp_image_tokens;
30493048
if(cliptokensneeded>0 && cliptokensneeded < nctx)
30503049
{
3051-
int tokcnt = (i==0?(chunk.clp_image_tokens):(chunk.clp_image_tokens+sepsize));
3050+
int tokcnt = (chunk.clp_image_tokens + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
30523051
if(i==0)
30533052
{
30543053
tokcnt += introsize;
@@ -3101,7 +3100,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
31013100
int cliptokensneeded = total_chunk_tokens;
31023101
if(cliptokensneeded>0 && cliptokensneeded < nctx)
31033102
{
3104-
int tokcnt = (i==0?(cliptokensneeded):(cliptokensneeded+sepsize));
3103+
int tokcnt = (cliptokensneeded + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
31053104
if(i==0)
31063105
{
31073106
tokcnt += introsize;
@@ -3289,6 +3288,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
32893288
media_object lv;
32903289
lv.b64data = item;
32913290
lv.is_audio = false;
3291+
TokenizeString("<image>", lv.chunk_start_seq, file_format, false);
3292+
TokenizeString("</image>\n\n", lv.chunk_end_seq, file_format, false);
32923293
media_objects.push_back(lv);
32933294
new_media_composite += item;
32943295
}
@@ -3301,6 +3302,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
33013302
media_object lv;
33023303
lv.b64data = item;
33033304
lv.is_audio = true;
3305+
TokenizeString("<audio>", lv.chunk_start_seq, file_format, false);
3306+
TokenizeString("</audio>\n\n", lv.chunk_end_seq, file_format, false);
33043307
media_objects.push_back(lv);
33053308
new_media_composite += item;
33063309
}
@@ -3473,16 +3476,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
34733476
// tokenize the prompt
34743477
std::vector<int> embd_inp;
34753478
std::vector<int> embd_inp_mem; //for storing added memory
3476-
std::vector<int> media_sep; //to separate between different llava images
3477-
std::vector<int> media_intro; //to separate between different llava images
3479+
std::vector<int> media_intro; //added before media list
34783480
std::vector<int> guidance_embd; //holds the guidance prompt
34793481
bool media_embds_built = false;
34803482

34813483
int32_t nctx = kcpp_data->n_ctx;
34823484

34833485
TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
34843486
bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
3485-
TokenizeString("\n\n", media_sep, file_format, false);
34863487
TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
34873488

34883489
if(media_composite_image_signature=="")
@@ -3491,7 +3492,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
34913492
}
34923493
if(media_data_changed)
34933494
{
3494-
PrepareMediaEmbds(nctx, media_sep, media_intro);
3495+
PrepareMediaEmbds(nctx, media_intro);
34953496
media_embds_built = true;
34963497
}
34973498

@@ -4263,7 +4264,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
42634264
{
42644265
if(!media_embds_built) //this should never happen! however, handle it anyway
42654266
{
4266-
PrepareMediaEmbds(nctx, media_sep, media_intro);
4267+
PrepareMediaEmbds(nctx, media_intro);
42674268
media_embds_built = true;
42684269
printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n");
42694270
}
@@ -4278,7 +4279,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
42784279
//batch is empty, do image processing
42794280
int llavatokenscounted = 0;
42804281
int llavatokensevaled = 0;
4281-
int sepsize = media_sep.size();
42824282
int introsize = media_intro.size();
42834283
while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))
42844284
{
@@ -4310,21 +4310,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
43104310
n_past += introsize;
43114311
llavatokensevaled += introsize;
43124312
}
4313-
if(sepsize>0 && i>0)
4314-
{
4313+
4314+
int start_size = media_objects[i].chunk_start_seq.size();
4315+
if (start_size > 0) {
43154316
//add the start separator before each media chunk
4316-
kcpp_embd_batch batch = kcpp_embd_batch(media_sep, n_past, use_mrope, false);
4317+
kcpp_embd_batch batch = kcpp_embd_batch(media_objects[i].chunk_start_seq, n_past, use_mrope, false);
43174318
auto evr = llama_decode(llama_ctx_v4, batch.batch);
43184319
if(evr!=0)
43194320
{
43204321
printf("\nError when appending media separator: %d\n",evr);
43214322
}
43224323
else
43234324
{
4324-
printf("\rProcessing Media Separator (%d tokens)",sepsize);
4325+
printf("\rProcessing Media Start Separator (%d tokens)",start_size);
43254326
}
4326-
n_past += sepsize;
4327-
llavatokensevaled += sepsize;
4327+
n_past += start_size;
4328+
llavatokensevaled += start_size;
43284329
}
43294330

43304331
for(int j=0;j<media_objects[i].mediachunks.size();++j)
@@ -4348,6 +4349,23 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
43484349
return output;
43494350
}
43504351
}
4352+
4353+
int end_size = media_objects[i].chunk_end_seq.size();
4354+
if (end_size > 0) {
4355+
//add the end separator after each media chunk
4356+
kcpp_embd_batch batch = kcpp_embd_batch(media_objects[i].chunk_end_seq, n_past, use_mrope, false);
4357+
auto evr = llama_decode(llama_ctx_v4, batch.batch);
4358+
if(evr!=0)
4359+
{
4360+
printf("\nError when appending media separator: %d\n",evr);
4361+
}
4362+
else
4363+
{
4364+
printf("\rProcessing Media End Separator (%d tokens)",end_size);
4365+
}
4366+
n_past += end_size;
4367+
llavatokensevaled += end_size;
4368+
}
43514369
}
43524370
if(llavatokenscounted!=llavatokensevaled)
43534371
{

0 commit comments

Comments
 (0)