@@ -1163,7 +1163,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         auto er = llama_decode(llama_ctx_v4, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
         if(er!=0)
         {
-            printf("\nLLAMA EVAL returned nonzero!\n");
+            printf("\nLLAMA EVAL returned nonzero: %d\n",er);
         }
         return ModelLoadResult::SUCCESS;
     }
@@ -1806,13 +1806,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::vector<int> embd_inp;
     std::vector<int> embd_inp_mem; //for storing added memory
     std::vector<int> llava_mem; //for storing dummy tokens that will be consumed by llava
+    std::vector<int> llava_sep; //to separate between different llava images
 
     int32_t nctx = kcpp_params->n_ctx;
 
     TokenizeString(kcpp_params->prompt, embd_inp, file_format);
 
     if(clp_ctx!=nullptr && clp_img_data!=nullptr)
     {
+        TokenizeString("\n\n", llava_sep, file_format, false);
+        int sepsize = llava_sep.size();
+
         for(int i=0;i<llava_images.size();++i)
         {
             std::string llava_image = llava_images[i].b64data;
@@ -1834,11 +1838,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             if(llava_images[i].clp_image_tokens>0 && llava_images[i].clp_image_tokens < nctx)
             {
-                for(int n=0;n<llava_images[i].clp_image_tokens;++n)
+                int tokcnt = (i==0?(llava_images[i].clp_image_tokens):(llava_images[i].clp_image_tokens+sepsize));
+                for(int n=0;n<tokcnt;++n)
                 {
                     llava_mem.push_back(current_llava_identifier);
                 }
-            }else
+            }
+            else
             {
                 printf("\nWarning: LLAVA Image excluded - Context size too low or not enough clip tokens!\n");
             }
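The hunk above changes how many placeholder tokens get reserved per image: every image after the first now also reserves room for the separator decoded before it, so the dummy buffer stays in step with what is later evaluated. The following is a minimal standalone sketch of just that accounting; ImageSlot, reserve_llava_mem, and the sample token counts are made-up names and values for illustration, not part of koboldcpp.

// Standalone sketch of the tokcnt reservation logic above.
// ImageSlot and reserve_llava_mem are hypothetical, for illustration only.
#include <cstdio>
#include <vector>

struct ImageSlot { int clp_image_tokens; };

std::vector<int> reserve_llava_mem(const std::vector<ImageSlot> &images,
                                   int sepsize, int placeholder_id)
{
    std::vector<int> llava_mem;
    for (size_t i = 0; i < images.size(); ++i)
    {
        // First image: just its clip tokens. Later images also reserve
        // space for the "\n\n" separator decoded before them.
        int tokcnt = (i == 0) ? images[i].clp_image_tokens
                              : images[i].clp_image_tokens + sepsize;
        for (int n = 0; n < tokcnt; ++n)
        {
            llava_mem.push_back(placeholder_id);
        }
    }
    return llava_mem;
}

int main()
{
    std::vector<ImageSlot> images = {{576}, {576}}; // assumed clip token counts
    auto mem = reserve_llava_mem(images, /*sepsize=*/2, /*placeholder_id=*/-1);
    // 576 + (576 + 2) = 1154 placeholder tokens reserved.
    printf("reserved %zu placeholder tokens\n", mem.size());
    return 0;
}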
@@ -2387,6 +2393,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            //batch is empty, do image processing
            int llavatokenscounted = 0;
            int llavatokensevaled = 0;
+           int sepsize = llava_sep.size();
            while(input_consumed < embd_inp.size() && (embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_A || embd_inp[input_consumed]==LLAVA_TOKEN_IDENTIFIER_B))
            {
                last_n_tokens.erase(last_n_tokens.begin());
@@ -2397,6 +2404,22 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            }
            for(int i=0;i<llava_images.size();++i)
            {
+               if(i>0 && sepsize>0)
+               {
+                   //add a separator between each image
+                   auto evr = llama_decode(llama_ctx_v4, llama_batch_get_one(llava_sep.data(), sepsize, n_past, 0));
+                   if(evr!=0)
+                   {
+                       printf("\nError when appending llava separator: %d\n",evr);
+                   }
+                   else
+                   {
+                       printf("\rProcessing LLaVa Separator (%d tokens)",sepsize);
+                   }
+                   n_past += sepsize;
+                   llavatokensevaled += sepsize;
+               }
+
                if(allow_regular_prints)
                {
                    printf("\rProcessing LLaVa Embedding %d (%d tokens)",(i+1), llava_images[i].clp_image_tokens);