@@ -377,12 +377,16 @@ struct llama_server_context
                 return false;
             }
 
-            if (params.n_ctx < 2048) { // request larger context for the image embedding
+            if (params.n_ctx != 0 && params.n_ctx < 2048) { // request larger context for the image embedding
                 params.n_ctx = 2048;
             }
         }
 
+        // dedicate one sequence to the system prompt
+        params.n_parallel += 1;
+
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        params.n_parallel -= 1; // but be sneaky about it
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
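
Note (not part of the patch): the net effect of this hunk is that llama_init_from_gpt_params creates the context with one extra sequence, so sequence 0 can hold the shared system prompt while slot i uses sequence i + 1 in every hunk below; n_parallel is then restored so the rest of the server still counts slots as before. A minimal sketch of that convention, with the helper name slot_seq_id chosen here purely for illustration:

    // Sketch only: sequence-id layout implied by this change (helper name is hypothetical).
    #include <cstdint>

    typedef int32_t llama_seq_id;             // same underlying type as the typedef in llama.h

    static const llama_seq_id SYSTEM_SEQ = 0; // the shared system prompt lives in sequence 0

    // slot 0 -> sequence 1, slot 1 -> sequence 2, ...
    static llama_seq_id slot_seq_id(int32_t slot_id) {
        return slot_id + 1;
    }
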
@@ -862,9 +866,9 @@ struct llama_server_context
         }
 
         // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
+        for (int32_t i = 1; i <= params.n_parallel; ++i)
         {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
         }
     }
 
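
Note (not part of the patch): with one sequence reserved for the system prompt, the slot sequences now run from 1 to n_parallel inclusive, hence the change to i <= params.n_parallel. In llama.cpp, passing -1 for both position arguments of llama_kv_cache_seq_cp selects the whole cached range, so the entire system-prompt cache of sequence 0 is shared with each slot sequence instead of only the fixed [0, system_tokens.size()) window. The same loop in isolation, as a hedged sketch that assumes a llama_context * whose sequence 0 already holds the evaluated system prompt (the helper name is illustrative):

    #include "llama.h"

    // Share the system-prompt KV cache of sequence 0 with every slot sequence 1..n_parallel.
    static void share_system_prompt(llama_context * ctx, int32_t n_parallel) {
        for (llama_seq_id i = 1; i <= n_parallel; ++i) {
            // p0 = -1 and p1 = -1 cover the full range of cached positions
            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
        }
    }
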
@@ -1351,7 +1355,7 @@ struct llama_server_context
                 std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
                 for (int i = 0; i < (int) append_tokens.size(); ++i)
                 {
-                    llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                    llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id + 1 }, true);
                     slot.n_past += 1;
                 }
             }
@@ -1587,8 +1591,8 @@ struct llama_server_context
                     {"n_system_tokens", system_tokens.size()},
                     {"n_cache_tokens",  slot.cache_tokens.size()}
                 });
-                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
 
                 for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
                 {
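
Note (not part of the patch): this is the context-shift path, now targeting sequence slot.id + 1. The seq_rm call drops the cached positions [n_keep, n_keep + n_discard), and the seq_add call shifts every later position back by n_discard so the sequence stays contiguous; the loop that follows appears to do the same compaction on slot.cache_tokens. The position arithmetic as a standalone sketch (no llama.cpp dependency, helper name is illustrative):

    #include <cassert>

    // After discarding n_discard tokens that follow the first n_keep kept ones,
    // a token that was at position p >= n_keep + n_discard ends up at p - n_discard.
    static int shifted_pos(int p, int n_keep, int n_discard) {
        assert(p >= n_keep + n_discard);
        return p - n_discard;
    }

    int main() {
        // keep 4 tokens, discard the next 8: the token at position 12 moves to position 4
        assert(shifted_pos(12, /*n_keep=*/4, /*n_discard=*/8) == 4);
        return 0;
    }
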
@@ -1640,7 +1644,7 @@ struct llama_server_context
 
             // TODO: we always have to take into account the "system_tokens"
             //       this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
             slot.n_past += 1;
         }
 
@@ -1808,13 +1812,28 @@ struct llama_server_context
                         }
                     }
 
+                    // keep only the common part
                     int p0 = (int) system_tokens.size() + slot.n_past;
                     LOG_INFO("kv cache rm [p0, end)", {
                         { "slot_id", slot.id },
                         { "task_id", slot.task_id },
                         { "p0",      p0 }
                     });
-                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+                        // could not partially delete (likely using a non-Transformer model)
+                        // TODO: logging
+                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+
+                        // there is no common part left (except for the system prompt)
+                        // TODO: maybe find a way to refactor this to reuse the !cache_prompt case above
+                        slot.n_past = 0;
+                        slot.n_past_se = 0;
+                        slot.ga_i = 0;
+                        slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
+                        // TODO: is the system prompt ever in the sampling context?
+                        llama_sampling_reset(slot.ctx_sampling);
+                    }
 
                     LOG_VERBOSE("prompt ingested", {
                         {"n_past",    slot.n_past},
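
Note (not part of the patch): llama_kv_cache_seq_rm now reports whether the removal succeeded. Recurrent-state models such as Mamba keep a single rolling state per sequence, so removing only the tail [p0, end) is not possible and the call returns false; the fallback above then wipes the slot's sequence, re-shares the system prompt from sequence 0, and resets the slot so the whole prompt is re-evaluated. The same pattern as a hedged sketch, using only the KV-cache calls that appear in this diff (the helper name is illustrative):

    #include "llama.h"

    // Try to drop the cached tail [p0, end) of a slot sequence. If the cache type cannot
    // do a partial removal, clear the sequence and re-share the system prompt instead.
    // Returns true when the cached prefix [0, p0) can be reused, false when the caller
    // must reprocess the prompt from position 0.
    static bool truncate_or_reset(llama_context * ctx, llama_seq_id seq, llama_pos p0) {
        if (llama_kv_cache_seq_rm(ctx, seq, p0, -1)) {
            return true;
        }
        llama_kv_cache_seq_rm(ctx, seq, -1, -1);    // remove everything in this sequence
        llama_kv_cache_seq_cp(ctx, 0, seq, -1, -1); // share the system prompt again
        return false;
    }
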
@@ -1843,7 +1862,7 @@ struct llama_server_context
                                 ga_i += ga_w/ga_n;
                             }
                         }
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
                         slot_npast++;
                     }
 
@@ -1897,9 +1916,9 @@ struct llama_server_context
                             LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
                             LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
 
-                            llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
-                            llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
-                            llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+                            llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+                            llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
+                            llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
 
                             slot.n_past_se -= bd;
 