@@ -3009,13 +3009,12 @@ int GetThreadsToUse(bool blasmode)
 }
 
 //this function prepares the clip embds for llava. it's only needed when images change
-static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep, const std::vector<int> & media_intro)
+static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_intro)
 {
     bool vision_on = (clp_ctx_v != nullptr && clp_img_data != nullptr);
     bool audio_on = (clp_ctx_a != nullptr);
     if (vision_on || audio_on)
     {
-        int sepsize = media_sep.size();
         int introsize = media_intro.size();
         last_media_mem.clear();
 
@@ -3048,7 +3047,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
             int cliptokensneeded = chunk.clp_image_tokens;
             if (cliptokensneeded>0 && cliptokensneeded < nctx)
             {
-                int tokcnt = (i==0?(chunk.clp_image_tokens):(chunk.clp_image_tokens+sepsize));
+                int tokcnt = (chunk.clp_image_tokens + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
                 if(i==0)
                 {
                     tokcnt += introsize;
@@ -3101,7 +3100,7 @@ static void PrepareMediaEmbds(const int nctx, const std::vector<int> & media_sep
             int cliptokensneeded = total_chunk_tokens;
             if (cliptokensneeded>0 && cliptokensneeded < nctx)
             {
-                int tokcnt = (i==0?(cliptokensneeded):(cliptokensneeded+sepsize));
+                int tokcnt = (cliptokensneeded + media_objects[i].chunk_start_seq.size() + media_objects[i].chunk_end_seq.size());
                 if(i==0)
                 {
                     tokcnt += introsize;
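
Review note: the two tokcnt hunks above replace the old conditional (the first image paid nothing, later images paid sepsize) with a uniform rule where every object budgets for its own chunk_start_seq and chunk_end_seq, plus introsize once for the first object. Below is a minimal, self-contained sketch of the new accounting; it is not part of the commit, the struct is stripped to the fields used here, and the token ids and counts are made up.

// Minimal sketch of the new token budgeting, assuming a stripped-down
// media_object; the token ids and counts below are illustrative only.
#include <cstdio>
#include <vector>

struct media_object {
    std::vector<int> chunk_start_seq; // e.g. tokenized "<image>"
    std::vector<int> chunk_end_seq;   // e.g. tokenized "</image>\n\n"
    int clp_image_tokens;             // tokens the clip embd occupies
};

int main() {
    std::vector<media_object> media_objects = {
        {{101}, {102, 103}, 576},
        {{101}, {102, 103}, 576},
    };
    int introsize = 4; // size of the tokenized "\nAttached Media:\n"
    int total = 0;
    for (size_t i = 0; i < media_objects.size(); ++i) {
        // every object now pays for its own start and end sequences,
        // replacing the old "sepsize for all but the first" rule
        int tokcnt = media_objects[i].clp_image_tokens
                   + (int)media_objects[i].chunk_start_seq.size()
                   + (int)media_objects[i].chunk_end_seq.size();
        if (i == 0) { tokcnt += introsize; } // intro is only counted once
        total += tokcnt;
    }
    printf("context needed for media: %d tokens\n", total); // 2*(576+1+2) + 4 = 1162
    return 0;
}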
@@ -3289,6 +3288,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 media_object lv;
                 lv.b64data = item;
                 lv.is_audio = false;
+                TokenizeString("<image>", lv.chunk_start_seq, file_format, false);
+                TokenizeString("</image>\n\n", lv.chunk_end_seq, file_format, false);
                 media_objects.push_back(lv);
                 new_media_composite += item;
             }
@@ -3301,6 +3302,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 media_object lv;
                 lv.b64data = item;
                 lv.is_audio = true;
+                TokenizeString("<audio>", lv.chunk_start_seq, file_format, false);
+                TokenizeString("</audio>\n\n", lv.chunk_end_seq, file_format, false);
                 media_objects.push_back(lv);
                 new_media_composite += item;
             }
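
These two hunks move the separator decision to ingestion time: instead of one shared media_sep ("\n\n") inserted between images, each media_object now carries its own tag token sequences, <image>...</image> for vision and <audio>...</audio> for audio. A rough illustration of the pattern follows; TokenizeString is replaced by a fake stand-in (the real one dispatches on file_format), so this is a sketch, not the commit's code.

// Rough sketch of per-object tagging, with a fake tokenizer.
#include <functional>
#include <string>
#include <vector>

struct media_object {
    std::string b64data;
    bool is_audio = false;
    std::vector<int> chunk_start_seq; // tokens decoded before the object
    std::vector<int> chunk_end_seq;   // tokens decoded after the object
};

// hypothetical stand-in for the real TokenizeString(text, out, file_format, add_bos)
static void FakeTokenize(const std::string & s, std::vector<int> & out) {
    out.push_back((int)(std::hash<std::string>{}(s) & 0x7fff));
}

static media_object make_media_object(const std::string & item, bool is_audio) {
    media_object lv;
    lv.b64data = item;
    lv.is_audio = is_audio;
    // the wrapper tags now travel with the object itself
    FakeTokenize(is_audio ? "<audio>" : "<image>", lv.chunk_start_seq);
    FakeTokenize(is_audio ? "</audio>\n\n" : "</image>\n\n", lv.chunk_end_seq);
    return lv;
}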
@@ -3473,16 +3476,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     //tokenize the prompt
     std::vector<int> embd_inp;
     std::vector<int> embd_inp_mem; //for storing added memory
-    std::vector<int> media_sep; //to separate between different llava images
-    std::vector<int> media_intro; //to separate between different llava images
+    std::vector<int> media_intro; //added before media list
     std::vector<int> guidance_embd; //holds the guidance prompt
     bool media_embds_built = false;
 
     int32_t nctx = kcpp_data->n_ctx;
 
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
     bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
-    TokenizeString("\n\n", media_sep, file_format, false);
     TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
 
     if(media_composite_image_signature=="")
@@ -3491,7 +3492,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
     if(media_data_changed)
     {
-        PrepareMediaEmbds(nctx, media_sep, media_intro);
+        PrepareMediaEmbds(nctx, media_intro);
         media_embds_built = true;
     }
 
@@ -4263,7 +4264,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             {
                 if(!media_embds_built) //this should never happen! however, handle it anyway
                 {
-                    PrepareMediaEmbds(nctx, media_sep, media_intro);
+                    PrepareMediaEmbds(nctx, media_intro);
                     media_embds_built = true;
                     printf("\nSomehow vision embd was not prepared (maybe no fast forward), rebuilding it...\n");
                 }
@@ -4278,7 +4279,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 //batch is empty, do image processing
                 int llavatokenscounted = 0;
                 int llavatokensevaled = 0;
-                int sepsize = media_sep.size();
                 int introsize = media_intro.size();
                 while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==MEDIA_TOKEN_IDENTIFIER_B))
                 {
@@ -4310,21 +4310,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                         n_past += introsize;
                         llavatokensevaled += introsize;
                     }
-                    if(sepsize>0 && i>0)
-                    {
+
+                    int start_size = media_objects[i].chunk_start_seq.size();
+                    if (start_size > 0) {
                         //add a separator between each image
-                        kcpp_embd_batch batch = kcpp_embd_batch(media_sep, n_past, use_mrope, false);
+                        kcpp_embd_batch batch = kcpp_embd_batch(media_objects[i].chunk_start_seq, n_past, use_mrope, false);
                         auto evr = llama_decode(llama_ctx_v4, batch.batch);
                         if(evr!=0)
                         {
                             printf("\nError when appending media separator: %d\n",evr);
                         }
                         else
                         {
-                            printf("\rProcessing Media Separator (%d tokens)",sepsize);
+                            printf("\rProcessing Media Start Separator (%d tokens)",start_size);
                         }
-                        n_past += sepsize;
-                        llavatokensevaled += sepsize;
+                        n_past += start_size;
+                        llavatokensevaled += start_size;
                     }
 
                     for(int j=0;j<media_objects[i].mediachunks.size();++j)
@@ -4348,6 +4349,23 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                             return output;
                         }
                     }
+
+                    int end_size = media_objects[i].chunk_end_seq.size();
+                    if (end_size > 0) {
+                        //add a separator between each image
+                        kcpp_embd_batch batch = kcpp_embd_batch(media_objects[i].chunk_end_seq, n_past, use_mrope, false);
+                        auto evr = llama_decode(llama_ctx_v4, batch.batch);
+                        if(evr!=0)
+                        {
+                            printf("\nError when appending media separator: %d\n",evr);
+                        }
+                        else
+                        {
+                            printf("\rProcessing Media End Separator (%d tokens)",end_size);
+                        }
+                        n_past += end_size;
+                        llavatokensevaled += end_size;
+                    }
                 }
                 if(llavatokenscounted!=llavatokensevaled)
                 {
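
With the end-separator hunk in place, the per-object evaluation order becomes: intro tokens (first object only), then chunk_start_seq, then the clip embedding chunks, then chunk_end_seq, advancing n_past and llavatokensevaled after each successful decode. A condensed, stubbed-out view of that loop follows; a placeholder decode_tokens stands in for building a kcpp_embd_batch and calling llama_decode, and chunk_tokens stands in for mediachunks, so treat this as a sketch of the control flow rather than the committed code.

// Condensed view of the per-object eval order after this commit.
#include <cstdio>
#include <vector>

struct media_object {
    std::vector<int> chunk_start_seq;
    std::vector<int> chunk_end_seq;
    std::vector<int> chunk_tokens; // token count per clip embd chunk (placeholder)
};

static void decode_tokens(const std::vector<int> & toks, int & n_past, int & evaled) {
    // real code: kcpp_embd_batch batch(toks, n_past, ...); llama_decode(llama_ctx_v4, batch.batch);
    n_past += (int)toks.size();
    evaled += (int)toks.size();
}

int main() {
    std::vector<int> media_intro = {1, 2, 3, 4}; // e.g. tokenized "\nAttached Media:\n"
    media_object img;
    img.chunk_start_seq = {10};     // "<image>"
    img.chunk_end_seq = {11, 12};   // "</image>\n\n"
    img.chunk_tokens = {576};       // one chunk of 576 image tokens
    std::vector<media_object> media_objects = {img, img};

    int n_past = 0, evaled = 0;
    for (size_t i = 0; i < media_objects.size(); ++i) {
        if (i == 0) { decode_tokens(media_intro, n_past, evaled); }       // intro once
        decode_tokens(media_objects[i].chunk_start_seq, n_past, evaled);  // start tag
        for (int t : media_objects[i].chunk_tokens) {
            n_past += t; evaled += t; // real code evals the embd chunk here
        }
        decode_tokens(media_objects[i].chunk_end_seq, n_past, evaled);    // end tag
    }
    printf("evaled %d media tokens, n_past = %d\n", evaled, n_past); // 1162 each
    return 0;
}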