@@ -438,12 +438,16 @@ struct llama_server_context
                 return false;
             }
 
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
+            if (params.n_ctx != 0 && params.n_ctx < 2048) { // request larger context for the image embedding
                 params.n_ctx = 2048;
             }
         }
 
+        // dedicate one sequence to the system prompt
+        params.n_parallel += 1;
+
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        params.n_parallel -= 1; // but be sneaky about it
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -923,9 +927,9 @@ struct llama_server_context
         }
 
         // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
+        for (int32_t i = 1; i <= params.n_parallel; ++i)
         {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
         }
     }
 
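Because params.n_parallel was restored after context creation, the slot sequences are now 1..n_parallel, hence the loop bound change to <=. Passing -1 for both positions copies whatever sequence 0 holds instead of the fixed range [0, system_tokens.size()), which also covers caches that are not addressed by token position (e.g. recurrent models). A sketch of the same broadcast as a standalone helper, assuming the llama.h C API of this version (the function name is illustrative):

#include <cstdint>
#include "llama.h"

// copy the fully evaluated system prompt from seq 0 into every slot sequence;
// p0 = p1 = -1 selects the whole sequence, so no token count is needed
static void broadcast_system_prompt(llama_context * ctx, int32_t n_slots) {
    for (int32_t i = 1; i <= n_slots; ++i) {
        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
    }
}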
@@ -1400,7 +1404,7 @@ struct llama_server_context
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
-                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id + 1 }, true);
                 slot.n_past += 1;
             }
         }
@@ -1636,8 +1640,8 @@ struct llama_server_context
                         {"n_system_tokens", system_tokens.size()},
                         {"n_cache_tokens",  slot.cache_tokens.size()}
                     });
-                    llama_kv_cache_seq_rm (ctx, slot.id,     n_keep            , n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, slot.id,     n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
 
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
                     {
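Only the sequence id changes here; the context shift itself is unchanged: llama_kv_cache_seq_rm drops the range [n_keep, n_keep + n_discard) and llama_kv_cache_seq_add then moves the remaining positions down by n_discard so they stay contiguous. A toy illustration of that position arithmetic on plain integers (the values are made up; the server derives n_keep and n_discard from the slot state):

#include <cstdio>
#include <vector>

int main() {
    const int n_keep = 2, n_discard = 3, n_past = 10;    // illustrative values only

    std::vector<int> pos;                                // positions currently in the sequence
    for (int p = 0; p < n_past; ++p) pos.push_back(p);

    std::vector<int> after;
    for (int p : pos) {
        if (p >= n_keep && p < n_keep + n_discard) continue;          // seq_rm: drop [n_keep, n_keep + n_discard)
        after.push_back(p >= n_keep + n_discard ? p - n_discard : p); // seq_add: shift the tail by -n_discard
    }

    for (int p : after) std::printf("%d ", p);           // prints: 0 1 2 3 4 5 6
    std::printf("\n");
    return 0;
}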
@@ -1689,7 +1693,7 @@ struct llama_server_context
 
                 // TODO: we always have to take into account the "system_tokens"
                 //       this is not great and needs to be improved somehow
-                llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+                llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
                 slot.n_past += 1;
             }
 
@@ -1852,13 +1856,28 @@ struct llama_server_context
                         }
                     }
 
+                    // keep only the common part
                     int p0 = (int) system_tokens.size() + slot.n_past;
                     LOG_INFO("kv cache rm [p0, end)", {
                         { "slot_id", slot.id },
                         { "task_id", slot.task_id },
                         { "p0",      p0 }
                     });
-                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+                        // could not partially delete (likely using a non-Transformer model)
+                        // TODO: logging
+                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+
+                        // there is no common part left (except for the system prompt)
+                        // TODO: maybe find a way to refactor this to reuse the !cache_prompt case above
+                        slot.n_past = 0;
+                        slot.n_past_se = 0;
+                        slot.ga_i = 0;
+                        slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
+                        // TODO: is the system prompt ever in the sampling context?
+                        llama_sampling_reset(slot.ctx_sampling);
+                    }
 
                     LOG_VERBOSE("prompt ingested", {
                         {"n_past",  slot.n_past},
@@ -1887,7 +1906,7 @@ struct llama_server_context
                                 ga_i += ga_w/ga_n;
                             }
                         }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
                         slot_npast++;
                     }
 
@@ -1941,9 +1960,9 @@ struct llama_server_context
                         LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
                         LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
+                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+                        llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
 
                         slot.n_past_se -= bd;
 